From 507ecb3fd812abec6d79ec0846be3e5f9d126904 Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Sat, 30 May 2026 17:27:17 +0800 Subject: [PATCH] =?UTF-8?q?Refactor:=20holistic=20L2Perf=20=E2=86=92=20L2S?= =?UTF-8?q?wimlane=20rename=20+=20drop=20dead=20fields?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A holistic naming pass over the L2 swimlane subsystem: 1. Type names get a consistent `L2Swimlane{Writer}{Kind}{Layer}` shape. 2. Pool struct rename (BufferState → Pool) — captures the actual semantic (a pool of buffers, not "a buffer's state"). 3. ReadyQueueEntry::is_phase (uint32_t magic) becomes L2SwimlaneBufferKind enum class (uint32_t underlying — no ABI break). 4. a2a3 only: drop L2SwimlaneAicpuTaskPool::aicore_ring_ptr and mismatch_record_count, both dead since #878 / #921. 5. File names l2_perf_*.{h,cpp} renamed to l2_swimlane_*.{h,cpp}. == Renaming scheme == Records / buffers / pools follow `L2Swimlane{Writer}{Kind}{Layer}`: L2PerfRecord -> L2SwimlaneAicpuTaskRecord L2PerfAicoreRecord -> L2SwimlaneAicoreTaskRecord AicpuPhaseRecord -> L2SwimlaneAicpuPhaseRecord L2PerfBuffer -> L2SwimlaneAicpuTaskBuffer L2PerfAicoreBuffer -> L2SwimlaneAicoreTaskBuffer PhaseBuffer -> L2SwimlaneAicpuPhaseBuffer L2PerfBufferState -> L2SwimlaneAicpuTaskPool L2PerfAicoreBufferState -> L2SwimlaneAicoreTaskPool PhaseBufferState -> L2SwimlaneAicpuPhasePool L2PerfDataHeader -> L2SwimlaneDataHeader L2PerfLevel -> L2SwimlaneLevel L2PerfFreeQueue -> L2SwimlaneFreeQueue L2PerfModule -> L2SwimlaneModule L2PerfCollector -> L2SwimlaneCollector AicpuPhaseId -> L2SwimlaneAicpuPhaseId AicpuPhaseHeader -> L2SwimlaneAicpuPhaseHeader AicoreRotation -> L2SwimlaneAicoreRotation AicoreLocalState -> L2SwimlaneAicoreLocalState L2PerfAicoreRing (a5) -> L2SwimlaneAicoreRing Enums (uint32_t underlying — wire format preserved): enum class L2SwimlaneBufferKind { AicpuTask = 0, AicpuPhase = 1, AicoreTask = 2, }; enum class 
ProfBufferType { AICPU_TASK = 0, AICPU_PHASE = 1, AICORE_TASK = 2 }; Functions / globals / statics: l2_perf_* -> l2_swimlane_*, complete_record -> complete_task, flush_buffers -> flush. KernelArgs cross-platform fields: l2_perf_data_base -> l2_swimlane_base aicore_ring_addr -> l2_swimlane_aicore_rotation_table Files: include/{common,host,aicpu,aicore}/l2_perf_*.{h,cpp} -> include/{common,host,aicpu,aicore}/l2_swimlane_*.{h,cpp} Output artifact: l2_perf_records.json -> l2_swimlane_records.json == Dead-field drop (a2a3 only) == L2SwimlaneAicpuTaskPool::aicore_ring_ptr — removed L2SwimlaneAicpuTaskPool::mismatch_record_count — removed Both fields have been dead since: - #878 moved AICore writes to a per-core L2SwimlaneAicoreTaskBuffer pool with its own state (L2SwimlaneAicoreTaskPool). - #921 moved the rotation-table fill from host to AICPU. - mismatch_record_count was last written before #878. Pad adjusted to keep sizeof(L2SwimlaneAicpuTaskPool) == 192 (static_assert remains green). Host reconcile() loop simplifies to total/dropped only. a5 keeps both fields — its AICore pipeline still uses the legacy staging-ring design where they are live. == Behaviour == Pure rename plus the dead-field drop. No struct layout change beyond removing 12 bytes that nobody read or wrote. uint32_t-underlying enums preserve wire format. 
== Test plan == - All four platform variants (a2a3, a2a3sim, a5, a5sim) build clean - tests/st/.../l2_swimlane pass on a2a3sim - Same ST passes on a2a3 device 1 - a5sim smoke (spmd_basic) passes --- conftest.py | 2 +- docs/dfx/dep_gen.md | 10 +- docs/dfx/l2-swimlane-profiling.md | 168 +++++----- docs/dfx/pmu-profiling.md | 2 +- docs/dfx/tensor-dump.md | 2 +- docs/hardware/cache-coherency.md | 12 +- docs/profiling-framework.md | 30 +- docs/profiling-name-map.md | 10 +- docs/sim_multi_device_isolation.md | 2 +- docs/testing.md | 6 +- .../workers/l2/vector_add/test_run_timing.py | 2 +- scope_stats/scope_stats.jsonl | 3 + simpler_setup/scene_test.py | 10 +- simpler_setup/tools/README.md | 48 +-- simpler_setup/tools/deps_to_graph.py | 10 +- .../tools/sched_overhead_analysis.py | 67 ++-- simpler_setup/tools/swimlane_converter.py | 53 +-- .../include/aicore/aicore_profiling_state.h | 22 +- ...icore.h => l2_swimlane_collector_aicore.h} | 39 +-- .../include/aicpu/dep_gen_collector_aicpu.h | 2 +- ..._aicpu.h => l2_swimlane_collector_aicpu.h} | 104 +++--- src/a2a3/platform/include/common/dep_gen.h | 4 +- .../platform/include/common/kernel_args.h | 14 +- ...rf_profiling.h => l2_swimlane_profiling.h} | 292 ++++++++--------- .../platform/include/common/platform_config.h | 12 +- .../platform/include/common/pmu_profiling.h | 8 +- .../platform/include/common/scope_stats.h | 2 +- .../platform/include/common/tensor_dump.h | 2 +- .../platform/include/host/dep_gen_collector.h | 2 +- ...rf_collector.h => l2_swimlane_collector.h} | 199 ++++++------ .../profiling_common/buffer_pool_manager.h | 4 +- .../host/profiling_common/profiler_base.h | 14 +- src/a2a3/platform/onboard/aicore/kernel.cpp | 34 +- src/a2a3/platform/onboard/aicpu/kernel.cpp | 6 +- src/a2a3/platform/onboard/host/CMakeLists.txt | 2 +- .../platform/onboard/host/device_runner.cpp | 23 +- .../platform/onboard/host/device_runner.h | 10 +- src/a2a3/platform/sim/aicore/inner_kernel.h | 4 +- src/a2a3/platform/sim/aicore/kernel.cpp 
| 39 +-- src/a2a3/platform/sim/host/CMakeLists.txt | 2 +- src/a2a3/platform/sim/host/device_runner.cpp | 58 ++-- src/a2a3/platform/sim/host/device_runner.h | 22 +- ...pu.cpp => l2_swimlane_collector_aicpu.cpp} | 304 ++++++++++-------- ...ollector.cpp => l2_swimlane_collector.cpp} | 230 +++++++------ .../aicore/aicore_executor.cpp | 22 +- .../host_build_graph/aicpu/aicpu_executor.cpp | 77 ++--- .../host_build_graph/runtime/runtime.h | 2 +- .../aicore/aicore_executor.cpp | 32 +- .../aicpu/aicpu_executor.cpp | 12 +- .../docs/profiling_levels.md | 26 +- .../host/dep_gen_replay.h | 2 +- .../runtime/pto_orchestrator.cpp | 54 ++-- .../runtime/pto_orchestrator.h | 6 +- .../runtime/runtime.h | 2 +- .../runtime/scheduler/scheduler_cold_path.cpp | 110 ++++--- .../scheduler/scheduler_completion.cpp | 28 +- .../runtime/scheduler/scheduler_context.h | 12 +- .../runtime/scheduler/scheduler_dispatch.cpp | 112 +++---- .../runtime/scheduler/scheduler_types.h | 6 +- .../include/aicore/aicore_profiling_state.h | 12 +- ...icore.h => l2_swimlane_collector_aicore.h} | 27 +- ..._aicpu.h => l2_swimlane_collector_aicpu.h} | 82 ++--- src/a5/platform/include/common/kernel_args.h | 22 +- ...rf_profiling.h => l2_swimlane_profiling.h} | 220 +++++++------ .../platform/include/common/platform_config.h | 14 +- .../platform/include/common/pmu_profiling.h | 2 +- src/a5/platform/include/common/scope_stats.h | 2 +- ...rf_collector.h => l2_swimlane_collector.h} | 151 ++++----- .../profiling_common/buffer_pool_manager.h | 10 +- .../host/profiling_common/profiler_base.h | 18 +- src/a5/platform/onboard/aicore/kernel.cpp | 22 +- src/a5/platform/onboard/aicpu/kernel.cpp | 4 +- src/a5/platform/onboard/host/CMakeLists.txt | 2 +- .../platform/onboard/host/device_runner.cpp | 26 +- src/a5/platform/onboard/host/device_runner.h | 16 +- src/a5/platform/sim/aicore/inner_kernel.h | 4 +- src/a5/platform/sim/aicore/kernel.cpp | 24 +- src/a5/platform/sim/host/CMakeLists.txt | 2 +- 
src/a5/platform/sim/host/device_runner.cpp | 51 +-- src/a5/platform/sim/host/device_runner.h | 22 +- ...pu.cpp => l2_swimlane_collector_aicpu.cpp} | 248 +++++++------- .../platform/src/aicpu/tensor_dump_aicpu.cpp | 2 +- ...ollector.cpp => l2_swimlane_collector.cpp} | 195 +++++------ src/a5/platform/src/host/pmu_collector.cpp | 2 +- .../aicore/aicore_executor.cpp | 12 +- .../host_build_graph/aicpu/aicpu_executor.cpp | 83 ++--- .../aicore/aicore_executor.cpp | 12 +- .../aicpu/aicpu_executor.cpp | 12 +- .../docs/profiling_levels.md | 26 +- .../runtime/pto_orchestrator.cpp | 74 ++--- .../runtime/pto_orchestrator.h | 6 +- .../runtime/scheduler/scheduler_cold_path.cpp | 117 +++---- .../scheduler/scheduler_completion.cpp | 28 +- .../runtime/scheduler/scheduler_context.h | 12 +- .../runtime/scheduler/scheduler_dispatch.cpp | 120 +++---- .../runtime/scheduler/scheduler_types.h | 6 +- .../onboard/host/device_runner_base.cpp | 10 +- .../onboard/host/device_runner_base.h | 22 +- src/common/task_interface/call_config.h | 4 +- .../dfx/dep_gen/test_dep_gen.py | 2 +- .../dfx/l2_swimlane/_swimlane_validate.py | 14 +- .../dfx/l2_swimlane/test_l2_swimlane.py | 4 +- 102 files changed, 2111 insertions(+), 2027 deletions(-) create mode 100644 scope_stats/scope_stats.jsonl rename src/a2a3/platform/include/aicore/{l2_perf_collector_aicore.h => l2_swimlane_collector_aicore.h} (75%) rename src/a2a3/platform/include/aicpu/{l2_perf_collector_aicpu.h => l2_swimlane_collector_aicpu.h} (66%) rename src/a2a3/platform/include/common/{l2_perf_profiling.h => l2_swimlane_profiling.h} (68%) rename src/a2a3/platform/include/host/{l2_perf_collector.h => l2_swimlane_collector.h} (62%) rename src/a2a3/platform/src/aicpu/{l2_perf_collector_aicpu.cpp => l2_swimlane_collector_aicpu.cpp} (68%) rename src/a2a3/platform/src/host/{l2_perf_collector.cpp => l2_swimlane_collector.cpp} (82%) rename src/a5/platform/include/aicore/{l2_perf_collector_aicore.h => l2_swimlane_collector_aicore.h} (73%) rename 
src/a5/platform/include/aicpu/{l2_perf_collector_aicpu.h => l2_swimlane_collector_aicpu.h} (67%) rename src/a5/platform/include/common/{l2_perf_profiling.h => l2_swimlane_profiling.h} (69%) rename src/a5/platform/include/host/{l2_perf_collector.h => l2_swimlane_collector.h} (66%) rename src/a5/platform/src/aicpu/{l2_perf_collector_aicpu.cpp => l2_swimlane_collector_aicpu.cpp} (64%) rename src/a5/platform/src/host/{l2_perf_collector.cpp => l2_swimlane_collector.cpp} (80%) diff --git a/conftest.py b/conftest.py index 6385a8f67..4c3cac374 100644 --- a/conftest.py +++ b/conftest.py @@ -572,7 +572,7 @@ def sort_key(item): items.sort(key=sort_key) # L3 perf collection is not supported yet: a single L3 case forks N chip-processes - # that all write l2_perf_records_.json to the same directory with + # that all write l2_swimlane_records_.json to the same directory with # second-precision timestamps, so they trample each other. Block the # combination up front; waiting for a proper device-id-in-filename fix. if config.getoption("--enable-l2-swimlane", default=0): diff --git a/docs/dfx/dep_gen.md b/docs/dfx/dep_gen.md index c1e83f8d4..1fda52e71 100644 --- a/docs/dfx/dep_gen.md +++ b/docs/dfx/dep_gen.md @@ -6,7 +6,7 @@ The swimlane profiler's per-task `fanout[]` array is the obvious place to read "which tasks did task X feed into?" — but it is **structurally incomplete on real hardware**. -Each producer task carries its own `L2PerfRecord.fanout[RUNTIME_MAX_FANOUT]`, +Each producer task carries its own `L2SwimlaneAicpuTaskRecord.fanout[RUNTIME_MAX_FANOUT]`, populated by the AICPU scheduler at the moment it wires a downstream consumer. If a producer has already finished and transitioned to `PTO2_TASK_COMPLETED` by the time a later submit wants to register a @@ -84,7 +84,7 @@ The `--enable-l2-swimlane` flag is independent but recommended in pair because: - `deps.json` is the dep_gen artifact. 
-- `l2_perf_records.json` (from swimlane) is the timing artifact; +- `l2_swimlane_records.json` (from swimlane) is the timing artifact; `merged_swimlane.json` (the Perfetto trace) uses `deps.json` for dependency arrows when both files exist. - The "fanout ⊆ deps" validation gate fires only when both files are @@ -262,7 +262,7 @@ Node visual encoding (legend top-right of the rendered HTML): | Gray dashed note | alloc — task from `alloc_tensors` (got a task_id, references downstream via `owner_task_id`, but never dispatched a kernel so has no perf record) | Labels read as `(ring, local) · func_name · core_type-implicit-via-shape`. -When a colocated `l2_perf_records.json` is present the func_id is enriched +When a colocated `l2_swimlane_records.json` is present the func_id is enriched with the kernel name via the sibling `name_map_.json` (written by SceneTest's `_dump_name_map`). @@ -288,11 +288,11 @@ sources / args / slices, so the raw `edges[]` count is a superset of the underlying task-pair count. `deps.json` (projected) is a **superset** of the fanout edges in -`l2_perf_records.json`: +`l2_swimlane_records.json`: | Edge source | Captures | Drops on race? | | ----------- | -------- | -------------- | -| `task.fanout[]` (L2PerfRecord) | Successors known at producer-retire time | **Yes** — sealed when producer retires | +| `task.fanout[]` (L2SwimlaneAicpuTaskRecord) | Successors known at producer-retire time | **Yes** — sealed when producer retires | | `deps.json` (this feature) | Every consumer → producer reachable via tensormap / explicit_deps | No — replay sees every submit | `tests/st/a2a3/tensormap_and_ringbuffer/dep_gen_capture/test_dep_gen_capture.py` diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md index 7ef434b8e..43255711a 100644 --- a/docs/dfx/l2-swimlane-profiling.md +++ b/docs/dfx/l2-swimlane-profiling.md @@ -45,7 +45,7 @@ available. 
`g_orch_*_cycle` counters — that's where you go for "which sub-step dominates overall"; the per-submit record covers "which submit was slow". -- **Standard outputs** — raw `l2_perf_records.json`, plus a +- **Standard outputs** — raw `l2_swimlane_records.json`, plus a Perfetto-loadable `merged_swimlane_*.json` produced by `swimlane_converter`. @@ -91,13 +91,13 @@ python tests/st//test_.py -p -d 0 --enable-l2-swimlane The flag sets `CallConfig::enable_l2_swimlane` to the chosen level. The host then allocates the per-core / per-thread shared region and publishes its base address through -`kernel_args.l2_perf_data_base`. AICore writes timing into +`kernel_args.l2_swimlane_data_base`. AICore writes timing into per-task WIP slots; AICPU commits the records on FIN. Per-task dispatch/finish timestamps and fanout are recorded only at level >= 2, scheduler phase records only at level >= 3, and orchestrator phase records only at level >= 4. -The JSON output `"l2_perf_level"` field is the captured perf_level: +The JSON output `"l2_swimlane_level"` field is the captured perf_level: `1` = AICore timing only, `2` = +dispatch/fanout, `3` = +scheduler phases, `4` = +orchestrator phases. @@ -114,7 +114,7 @@ runs): ```text / -├── l2_perf_records.json # raw runtime output +├── l2_swimlane_records.json # raw runtime output ├── name_map_.json # optional func_id → name mapping └── merged_swimlane.json # Perfetto trace (added by converter) ``` @@ -122,7 +122,7 @@ runs): Filenames are fixed (no per-file timestamp) — the directory is the per-task uniqueness boundary. -`l2_perf_records.json` carries the raw records — this is the file +`l2_swimlane_records.json` carries the raw records — this is the file you pass to `swimlane_converter`. Important fields per task: | Field | Meaning | @@ -157,17 +157,17 @@ unassigned). 
and produces a per-function task-execution summary: ```bash -# Auto-detects the latest outputs/*/l2_perf_records.json +# Auto-detects the latest outputs/*/l2_swimlane_records.json python -m simpler_setup.tools.swimlane_converter # Pin to a specific case + add func_id → name mapping python -m simpler_setup.tools.swimlane_converter \ - outputs/_/l2_perf_records.json \ + outputs/_/l2_swimlane_records.json \ --func-names outputs/_/name_map_.json # Custom output path python -m simpler_setup.tools.swimlane_converter \ - outputs/_/l2_perf_records.json -o my_trace.json + outputs/_/l2_swimlane_records.json -o my_trace.json ``` The output is `outputs/_/merged_swimlane.json` (or your @@ -244,70 +244,70 @@ What the swimlane shows: ### 5.1 Common interfaces -`kernel_args.l2_perf_data_base` is the single device-side handle +`kernel_args.l2_swimlane_data_base` is the single device-side handle host publishes for the run. The shared region carries a fixed -`L2PerfDataHeader` plus per-core / per-thread state (same struct +`L2SwimlaneDataHeader` plus per-core / per-thread state (same struct shape on both architectures): ```text -L2PerfDataHeader (host init, device R/W) +L2SwimlaneDataHeader (host init, device R/W) ├── queues [MAX_AICPU_THREADS][READYQUEUE_SIZE] ├── queue_heads / queue_tails (per-thread) └── num_cores -L2PerfBufferState[num_cores] (per-core AICPU pool state) +L2SwimlaneAicpuTaskPool[num_cores] (per-core AICPU pool state) ├── free_queue {buffer_ptrs[SLOT_COUNT], head, tail} -├── current_buf_ptr (AICPU active L2PerfBuffer*) +├── current_buf_ptr (AICPU active L2SwimlaneAicpuTaskBuffer*) ├── aicore_ring_ptr (legacy; kept for ABI continuity) ├── total_record_count ├── dropped_record_count └── mismatch_record_count (legacy; no longer written) -L2PerfAicoreBufferState[num_cores] (per-core AICore pool state) +L2SwimlaneAicoreTaskPool[num_cores] (per-core AICore pool state) ├── rotation {current_buf_ptr, generation} (AICPU writes, AICore reads │ — cache-line independent) ├── 
free_queue {buffer_ptrs[SLOT_COUNT], head, tail} ├── total_record_count / dropped_record_count └── current_buf_seq -[L2PerfAicoreBuffer × PLATFORM_AICORE_BUFFERS_PER_CORE per core] -└── L2PerfAicoreRecord records[PLATFORM_AICORE_BUFFER_SIZE] (1024 records, 32B each) +[L2SwimlaneAicoreTaskBuffer × PLATFORM_AICORE_BUFFERS_PER_CORE per core] +└── L2SwimlaneAicoreTaskRecord records[PLATFORM_AICORE_BUFFER_SIZE] (1024 records, 32B each) -[AicpuPhaseHeader + PhaseBufferState[num_threads]] (optional) +[L2SwimlaneAicpuPhaseHeader + L2SwimlaneAicpuPhasePool[num_threads]] (optional) ├── magic / num_sched_threads ├── core_to_thread[] (core_id → scheduler thread index) -└── per-thread phase buffers (PhaseBufferState aliases L2PerfBufferState) +└── per-thread phase buffers (L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool) ``` The records themselves are identical across architectures: -- `L2PerfRecord` — per-task AICPU-owned fields (task_id, dispatch_time, +- `L2SwimlaneAicpuTaskRecord` — per-task AICPU-owned fields (task_id, dispatch_time, finish_time, func_id, core_type, reg_task_id), 64-byte aligned. `reg_task_id` is the join key against the matching AICore record. -- `L2PerfAicoreRecord` — slim AICore-only record (start, end, task_id), +- `L2SwimlaneAicoreTaskRecord` — slim AICore-only record (start, end, task_id), 32 bytes; AICore writes one per task into its currently-active per-core buffer. -- `AicpuPhaseRecord` — per-iteration scheduler / orchestrator +- `L2SwimlaneAicpuPhaseRecord` — per-iteration scheduler / orchestrator phase, 40 bytes. This is the key reason a single `swimlane_converter` consumes both architectures' output unchanged. Orchestrator timing is carried -by per-submit `AicpuPhaseRecord` entries (ORCH_SUBMIT, folded from +by per-submit `L2SwimlaneAicpuPhaseRecord` entries (ORCH_SUBMIT, folded from the historical per-sub-step records); there is no separate shared-memory aggregate. 
The run-window envelope is emitted to device log via `LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…"`. **Producer/consumer protocol on AICore (AICore-as-producer with rotation).** -AICore writes a slim `L2PerfAicoreRecord` into its currently-active per-core -`L2PerfAicoreBuffer` at `records[slot_within_buf++]`. The active buffer is -published via a per-core `AicoreRotation` cache line (`current_buf_ptr` + +AICore writes a slim `L2SwimlaneAicoreTaskRecord` into its currently-active per-core +`L2SwimlaneAicoreTaskBuffer` at `records[slot_within_buf++]`. The active buffer is +published via a per-core `L2SwimlaneAicoreRotation` cache line (`current_buf_ptr` + `generation`); AICore `dcci`'s it per task — cheap relative to the baseline `dcci(payload, ENTIRE_DATA_CACHE)` it already pays per task. AICPU drives rotation: immediately before each `write_reg(DATA_MAIN_BASE)` for task `K`, if `K % PLATFORM_AICORE_BUFFER_SIZE == 0`, AICPU enqueues the current buffer to the per-thread ready queue (kind `is_phase=2`), -pops the next from `L2PerfAicoreBufferState::free_queue`, and bumps -`AicoreRotation::generation`. AICore detects the bumped generation on +pops the next from `L2SwimlaneAicoreTaskPool::free_queue`, and bumps +`L2SwimlaneAicoreRotation::generation`. AICore detects the bumped generation on its next task's `dcci`, refreshes its local cache, and resets its slot counter to 0. @@ -334,18 +334,18 @@ sched overhead per session as price for unbounded session length). `halHostRegister` maps device memory into host virtual address space so the host can read device buffers directly. 
-`L2PerfCollector` runs two background threads on top of a -[`BufferPoolManager`](../src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h): +`L2SwimlaneCollector` runs two background threads on top of a +[`BufferPoolManager`](../src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h): a mgmt thread that polls SPSC ready queues and recycles full buffers **while kernels are still executing**, plus a poll thread that drains the L2 hand-off queue into `on_buffer_collected`. -`L2PerfModule` declares two buffer kinds going through one ready +`L2SwimlaneModule` declares two buffer kinds going through one ready queue per AICPU thread: -- **kind 0**: per-core `L2PerfBuffer` (task records). -- **kind 1**: per-thread `PhaseBuffer` (scheduler / orchestrator +- **kind 0**: per-core `L2SwimlaneAicpuTaskBuffer` (task records). +- **kind 1**: per-thread `L2SwimlaneAicpuPhaseBuffer` (scheduler / orchestrator phase records). The `is_phase` flag on each `ReadyQueueEntry` picks between them. @@ -355,7 +355,7 @@ and TensorDump are single-kind. ```text HOST DEVICE ┌──────────────────────────┐ ┌──────────────────────────┐ -│ L2PerfCollector │ │ AICPU + AICore │ +│ L2SwimlaneCollector │ │ AICPU + AICore │ │ │ │ │ │ initialize(prefix) │ alloc + │ AICore on task end: │ │ rtMalloc + halRegister │──register────>│ write timing into │ @@ -365,14 +365,14 @@ and TensorDump are single-kind. 
│ start(tf) │ │ commit ring slot → │ │ ┌────────────────────┐ │ SPSC ready │ records[count], │ │ │ mgmt thread │ │ queues │ fill func_id / │ -│ │ (BufferPool driver)│ │<──L2Perf──────│ dispatch / finish / │ +│ │ (BufferPool driver)│ │<──L2Swimlane──────│ dispatch / finish / │ │ │ poll ready queue │<┼──+ Phase─────<│ fanout; rotate buffer │ │ │ recycle buffers │─┼──free queue──>│ when full │ │ └────────────────────┘ │ │ AICPU scheduler thread: │ │ ┌────────────────────┐ │ │ per-loop-iter: │ │ │ poll thread │ │ │ write AicpuPhase- │ │ │ reads via host │ │ shared mem │ Record into │ -│ │ mapping; copies │<┼──mapping─────<│ PhaseBuffer │ +│ │ mapping; copies │<┼──mapping─────<│ L2SwimlaneAicpuPhaseBuffer │ │ │ to host vectors │ │ │ │ │ └────────────────────┘ │ │ │ │ stop() │ │ │ @@ -380,16 +380,16 @@ and TensorDump are single-kind. │ read_phase_header_metadata() │ │ │ reconcile_counters() │ │ │ │ export_swimlane_json() │ │ │ -│ → l2_perf_records.json │ │ │ +│ → l2_swimlane_records.json │ │ │ └──────────────────────────┘ └──────────────────────────┘ ``` **Lifecycle** (`device_runner.cpp`): ```text -init_l2_perf() - l2_perf_collector_.initialize(num_aicore, ..., output_prefix_) - kernel_args_.args.l2_perf_data_base = l2_perf_collector_.get_l2_perf_shm_device_ptr() +init_l2_swimlane() + l2_swimlane_collector_.initialize(num_aicore, ..., output_prefix_) + kernel_args_.args.l2_swimlane_data_base = l2_swimlane_collector_.get_l2_swimlane_shm_device_ptr() start(tf) ← spawn mgmt + poll threads launch AICPU / AICore rtStreamSynchronize @@ -400,16 +400,16 @@ reconcile_counters() ← three-bucket accounting for both PERF and PHASE pools (total / collected / dropped); any non-zero current_buf_ptr is a flush bug -export_swimlane_json() ← writes /l2_perf_records.json +export_swimlane_json() ← writes /l2_swimlane_records.json finalize(unregister, free) ``` -[`L2PerfCollector`](../src/a2a3/platform/include/host/l2_perf_collector.h) 
+[`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h) on a2a3 inherits from -[`profiling_common::ProfilerBase`](../src/a2a3/platform/include/host/profiling_common/profiler_base.h): +[`profiling_common::ProfilerBase`](../src/a2a3/platform/include/host/profiling_common/profiler_base.h): the base class owns the mgmt thread, the poll thread, and the -`BufferPoolManager` they share. `L2PerfCollector` -supplies the L2-specific pieces — the `L2PerfModule` trait +`BufferPoolManager` they share. `L2SwimlaneCollector` +supplies the L2-specific pieces — the `L2SwimlaneModule` trait (notably `kBufferKinds = 2` and `kind_of()`), `initialize` that allocates and pre-fills both kinds of free queues, an `on_buffer_collected` callback that branches on @@ -423,8 +423,8 @@ framework reference. ### 5.3 a5 — same framework, host-shadow transport -a5's `L2PerfCollector` derives from -`ProfilerBase` and shares the +a5's `L2SwimlaneCollector` derives from +`ProfilerBase` and shares the mgmt + poll thread structure with a2a3. The single behavioral deviation from §5.2 is the **transport channel**: a5 has no `halHostRegister`, so each device buffer is paired with a @@ -432,32 +432,32 @@ host-shadow `malloc()` and the mgmt loop synchronizes the two via `profiling_copy.h` (`rtMemcpy` onboard, plain `memcpy` in sim). The AICore-side write target is a per-core, **stable** -`L2PerfAicoreRing` (`dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE]`) +`L2SwimlaneAicoreRing` (`dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE]`) allocated once by the host and addressed via -`L2PerfBufferState::aicore_ring_ptr` (AICPU side) and -`KernelArgs::aicore_l2_perf_ring_addrs[block_idx]` forwarded into -`set_aicore_l2_perf_ring()` by `KERNEL_ENTRY` (AICore side). The ring +`L2SwimlaneAicpuTaskPool::aicore_ring_ptr` (AICPU side) and +`KernelArgs::aicore_l2_swimlane_ring_addrs[block_idx]` forwarded into +`set_aicore_l2_swimlane_ring()` by `KERNEL_ENTRY` (AICore side). 
The ring address never changes during a run, so AICore's write address is -decoupled from the AICPU's rotating `L2PerfBuffer`. Buffer rotation is -internal to `l2_perf_aicpu_complete_record` when `records[count]` hits +decoupled from the AICPU's rotating `L2SwimlaneAicpuTaskBuffer`. Buffer rotation is +internal to `l2_swimlane_aicpu_complete_task` when `records[count]` hits `PLATFORM_PROF_BUFFER_SIZE`. The runtime `Handshake` carries no profiling fields. The framework's `MemoryOps` therefore carries five callbacks on a5 (`alloc` / `reg` / `free_` / `copy_to_device` / `copy_from_device`); the mgmt loop mirrors the entire shm region -(`L2PerfDataHeader` + per-core `L2PerfBufferState` + per-thread -`PhaseBufferState`) device → host at the top of every tick, then +(`L2SwimlaneDataHeader` + per-core `L2SwimlaneAicpuTaskPool` + per-thread +`L2SwimlaneAicpuPhasePool`) device → host at the top of every tick, then pushes back only the fields host actually modified (advanced `queue_heads[q]`, refilled `free_queue.tail` and `buffer_ptrs[slot]`) via `BufferPoolManager::write_range_to_device`. The bulk `mirror_shm_to_device` is deliberately **not** called from the mgmt loop: it would race with AICPU writes to device-only fields (`current_buf_ptr`, `total/dropped/mismatch` counters, -`queue_tails`, `free_queue.head`, `AicpuPhaseHeader::magic`, +`queue_tails`, `free_queue.head`, `L2SwimlaneAicpuPhaseHeader::magic`, `core_to_thread[]`) and roll them back to whatever the host shadow held at the start of the tick. Per-buffer -payloads (`L2PerfBuffer` / `PhaseBuffer`) are pulled on demand +payloads (`L2SwimlaneAicpuTaskBuffer` / `L2SwimlaneAicpuPhaseBuffer`) are pulled on demand inside `ProfilerAlgorithms::process_entry` after a popped ready-entry resolves to its host shadow. `BufferPoolManager`'s `release_owned_buffers` frees the device pointer via the @@ -466,14 +466,14 @@ collector's `release_fn` and the paired shadow via `std::free()`. 
```text HOST DEVICE ┌──────────────────────────┐ ┌──────────────────────────┐ -│ L2PerfCollector │ │ AICPU + AICore │ +│ L2SwimlaneCollector │ │ AICPU + AICore │ │ : ProfilerBase<...> │ │ │ │ │ │ │ │ initialize() │ alloc + reg │ AICore on task end: │ │ rtMalloc shm │──+ shadow────>│ write timing into │ -│ per-core L2PerfBuffer │ memset 0 │ per-core ring slot │ +│ per-core L2SwimlaneAicpuTaskBuffer │ memset 0 │ per-core ring slot │ │ per-core AicoreRing │ + push 0s │ dual_issue_slots[ │ -│ per-thread PhaseBuffer │ │ task_id & 1] │ +│ per-thread L2SwimlaneAicpuPhaseBuffer │ │ task_id & 1] │ │ register_mapping(s) │ │ │ │ set_memory_context │ │ AICPU on FIN: │ │ │ │ read ring slot → │ @@ -515,31 +515,31 @@ collector's `release_fn` and the paired shadow via `std::free()`. **Lifecycle** (`device_runner.cpp`): ```text -init_l2_perf() - l2_perf_collector_.initialize(num_aicore, ..., output_prefix_) - kernel_args_.args.l2_perf_data_base = l2_perf_collector_.get_l2_perf_setup_device_ptr() - kernel_args_.args.aicore_l2_perf_ring_addrs = - l2_perf_collector_.get_aicore_ring_addrs_device_ptr() -l2_perf_collector_.start(thread_factory) ← mgmt + poll threads +init_l2_swimlane() + l2_swimlane_collector_.initialize(num_aicore, ..., output_prefix_) + kernel_args_.args.l2_swimlane_data_base = l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr() + kernel_args_.args.aicore_l2_swimlane_ring_addrs = + l2_swimlane_collector_.get_aicore_ring_addrs_device_ptr() +l2_swimlane_collector_.start(thread_factory) ← mgmt + poll threads launch AICPU / AICore rtStreamSynchronize -l2_perf_collector_.stop() ← join mgmt + poll, drain final batch -l2_perf_collector_.read_phase_header_metadata() -l2_perf_collector_.reconcile_counters() ← sanity-check + 3-bucket cross-check -l2_perf_collector_.export_swimlane_json() -l2_perf_collector_.finalize() +l2_swimlane_collector_.stop() ← join mgmt + poll, drain final batch +l2_swimlane_collector_.read_phase_header_metadata() 
+l2_swimlane_collector_.reconcile_counters() ← sanity-check + 3-bucket cross-check +l2_swimlane_collector_.export_swimlane_json() +l2_swimlane_collector_.finalize() ``` -[`L2PerfCollector`](../src/a5/platform/include/host/l2_perf_collector.h) +[`L2SwimlaneCollector`](../src/a5/platform/include/host/l2_swimlane_collector.h) on a5 inherits the same CRTP base ([`profiling_common::ProfilerBase`](../src/a5/platform/include/host/profiling_common/profiler_base.h)) as a2a3 and parameterizes [`BufferPoolManager`](../src/a5/platform/include/host/profiling_common/buffer_pool_manager.h) -with `L2PerfModule` (`kBufferKinds = 2`). The only a5-specific +with `L2SwimlaneModule` (`kBufferKinds = 2`). The only a5-specific glue is the 5-callback `MemoryOps` and the per-tick shm mirror. -a5's per-thread AICPU flush hooks (`l2_perf_aicpu_flush_buffers` / -`l2_perf_aicpu_flush_phase_buffers`) are the only data path on the +a5's per-thread AICPU flush hooks (`l2_swimlane_aicpu_flush` / +`l2_swimlane_aicpu_flush_phase_buffers`) are the only data path on the records side — host never reads from `current_buf_ptr` to recover records. `reconcile_counters` is purely passive: it logs an error if any `current_buf_ptr` is non-zero with a non-empty buffer (a @@ -551,13 +551,13 @@ PHASE), same shape as a2a3. 
| Aspect | a2a3 | a5 | | ------ | ---- | -- | -| Record shape | identical (`L2PerfRecord` / `AicpuPhaseRecord`) | | +| Record shape | identical (`L2SwimlaneAicpuTaskRecord` / `L2SwimlaneAicpuPhaseRecord`) | | | AICore WIP-slot protocol | identical | | | AICPU commit on FIN | identical | | | Buffer model | rotating pool (free + ready queues) per kind | identical | | Ready queue | per-AICPU-thread, multiplexes PERF + PHASE via `is_phase` | identical | | Host threads | mgmt + poll, streams during execution | identical | -| Host-class shape | `ProfilerBase` (`kBufferKinds = 2`) | identical | +| Host-class shape | `ProfilerBase` (`kBufferKinds = 2`) | identical | | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` | | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) | | `reconcile_counters` | passive cross-check (collected + dropped + mismatch == device_total) | identical | @@ -577,7 +577,7 @@ When enabled, the dominant per-task overhead is: ring buffer plus a few metadata fields. -Per scheduler-loop iteration, AICPU also writes a 32-byte -`AicpuPhaseRecord` per phase (4 phases × 32 B = 128 B per +Per scheduler-loop iteration, AICPU also writes a 40-byte +`L2SwimlaneAicpuPhaseRecord` per phase (4 phases × 40 B = 160 B per iteration). Both architectures drain buffers concurrently with execution via the mgmt + poll thread pair; a5 additionally pays per-tick `rtMemcpy`/`memcpy` round-trips to keep the host shadow in @@ -607,7 +607,7 @@ benchmark is not perturbed. ### 7.2 a5 -- Each per-core `L2PerfBuffer` and per-thread `PhaseBuffer` is +- Each per-core `L2SwimlaneAicpuTaskBuffer` and per-thread `L2SwimlaneAicpuPhaseBuffer` is fixed-size. Tasks past `PLATFORM_PROF_BUFFER_SIZE` per core (and phases past `PLATFORM_PHASE_RECORDS_PER_THREAD` per thread) are silently dropped via AICPU early return; the host surfaces the @@ -627,7 +627,7 @@ benchmark is not perturbed. ## 8.
FAQ / Debug Guide -**No `l2_perf_records.json` produced.** Check that +**No `l2_swimlane_records.json` produced.** Check that `--enable-l2-swimlane` was passed. Verify `` exists in the run log; if `--rounds > 1`, only the first round records. @@ -636,7 +636,7 @@ automatically after a SceneTest with `--enable-l2-swimlane`; if it did not, run it manually: ```bash -python -m simpler_setup.tools.swimlane_converter outputs/_/l2_perf_records.json +python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json ``` **Tasks show as `func_` instead of human names.** The @@ -652,13 +652,13 @@ because the buffer pool ran out. On a2a3 check **`current_buf_ptr` non-empty at finalize on a2a3.** The host logs this as ERROR and does not recover. AICPU did not flush its -active L2 perf buffer at run end. Check the AICPU flush path runs +active L2 swimlane buffer at run end. Check the AICPU flush path runs for every thread that produced records. **Phase records empty.** Either the runtime did not emit phase data (only `tensormap_and_ringbuffer` does, and only when -`AicpuPhaseHeader::magic == AICPU_PHASE_MAGIC`), or the host's -`AicpuPhaseHeader` was not initialized. Verify the runtime sets +`L2SwimlaneAicpuPhaseHeader::magic == L2_SWIMLANE_AICPU_PHASE_MAGIC`), or the host's +`L2SwimlaneAicpuPhaseHeader` was not initialized. Verify the runtime sets the magic in its scheduler init path. **`dispatch_time_us` < `finish_time_us` mismatch.** Verify the runtime @@ -670,7 +670,7 @@ wrote the WIP slot but AICPU never committed. **Scheduler-overhead deep-dive missing from converter output.** The converter runs `sched_overhead_analysis` only when a device log is resolvable. Pass `-d ` or place a `device-*` -log under `outputs/` close in time to the `l2_perf_records.json` +log under `outputs/` close in time to the `l2_swimlane_records.json` mtime; see `simpler_setup/tools/README.md` for the resolver rules. 
diff --git a/docs/dfx/pmu-profiling.md b/docs/dfx/pmu-profiling.md index 12f091e1d..8f86ddfb7 100644 --- a/docs/dfx/pmu-profiling.md +++ b/docs/dfx/pmu-profiling.md @@ -301,7 +301,7 @@ shared-memory layout, an `init()` that allocates and pre-fills the free queues, an `on_buffer_collected()` callback that appends records to the CSV, and `reconcile_counters()` / `finalize()`. The mgmt/poll threading, buffer pooling, and `Module` trait pattern are shared with TensorDump -and L2Perf — see [profiling-framework.md](../profiling-framework.md) for +and L2Swimlane — see [profiling-framework.md](../profiling-framework.md) for the framework reference. ### 5.3 a5 — same framework, host-shadow transport (DAV_3510, 10 counters) diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md index 60f3bd8ba..c97b8b3d7 100644 --- a/docs/dfx/tensor-dump.md +++ b/docs/dfx/tensor-dump.md @@ -432,7 +432,7 @@ allocates and pre-fills free queues, an `on_buffer_collected` callback that gathers payload bytes into the in-memory record list, plus `reconcile_counters` / `export_dump_files` / `finalize`. The mgmt/poll threading, buffer pooling, and `Module` -trait pattern are shared with PMU and L2Perf — see +trait pattern are shared with PMU and L2Swimlane — see [profiling-framework.md](../profiling-framework.md) for the framework reference. diff --git a/docs/hardware/cache-coherency.md b/docs/hardware/cache-coherency.md index 80a451351..8605519e9 100644 --- a/docs/hardware/cache-coherency.md +++ b/docs/hardware/cache-coherency.md @@ -80,19 +80,19 @@ Two separate concerns, often conflated: stale value from a previous round). The AICPU side must emit `rmb()` between the COND check and the slot reads. 
-Concretely, the L2 perf staging-slot read in -`src/{a2a3,a5}/platform/src/aicpu/l2_perf_collector_aicpu.cpp` does +Concretely, the L2 swimlane staging-slot read in +`src/{a2a3,a5}/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp` does **not** call `cache_invalidate_range` on the slot, but it **does** call `rmb()` before reading `slot->task_id` and the timing fields. All of those fields are AICore writes covered by the AICore-side `dcci` in -`l2_perf_aicore_record_task`. The same pattern applies to the PMU +`l2_swimlane_aicore_record_task`. The same pattern applies to the PMU staging slot (`src/{a2a3,a5}/platform/src/aicpu/pmu_collector_aicpu.cpp`). ### Historical pitfall PR #540 (2026-04-15) added `cache_invalidate_range(slot, 64)` on the -AICPU side of the L2 perf staging slot, mirroring the +AICPU side of the L2 swimlane staging slot, mirroring the host-DMA-protocol pattern from PR #204. The two situations are **not** the same: host DMA bypasses the AICPU cache; AICore stores plus `dcci` do not. The cache invalidate was redundant — but the @@ -171,11 +171,11 @@ forever once they ship. - `src/{a2a3,a5}/platform/onboard/aicpu/cache_ops.cpp` — `cache_invalidate_range` implementation (`dc civac` / `dsb sy` / `isb`). - `src/{a2a3,a5}/platform/sim/aicpu/cache_ops.cpp` — sim no-op. -- AICore-side `dcci` usage lives in the L2 perf / PMU AICore collectors and any kernel that publishes to a GM slot AICPU reads. +- AICore-side `dcci` usage lives in the L2 swimlane / PMU AICore collectors and any kernel that publishes to a GM slot AICPU reads. ## Related docs - [PMU staging-slot ordering](../dfx/pmu-profiling.md) — detailed AICore-side `dcci` + barrier order for staging-slot writes. - [L2 swimlane profiling](../dfx/l2-swimlane-profiling.md) — - the consumer of the rules above on the L2 perf path. + the consumer of the rules above on the L2 swimlane path. 
diff --git a/docs/profiling-framework.md b/docs/profiling-framework.md index 54ca1a2ec..2f3d61682 100644 --- a/docs/profiling-framework.md +++ b/docs/profiling-framework.md @@ -1,6 +1,6 @@ # Profiling Framework -Shared host-side infrastructure that the PMU, L2Perf, and TensorDump +Shared host-side infrastructure that the PMU, L2Swimlane, and TensorDump collectors are built on. Each architecture maintains its own copy of the framework headers under `src//platform/include/host/profiling_common/` ([a2a3](../src/a2a3/platform/include/host/profiling_common/), @@ -25,7 +25,7 @@ Each profiling subsystem on a2a3 needs the same plumbing on the host: - A collector thread that drains the host-side hand-off queue and copies records out of each ready buffer. - A pool of pre-registered device buffers (allocated up-front, refilled on - demand) keyed by "kind" — PMU has 1 kind, TensorDump has 1, L2Perf has 2 + demand) keyed by "kind" — PMU has 1 kind, TensorDump has 1, L2Swimlane has 2 (perf records + phase markers). - A dev↔host pointer map so the management thread can resolve a device pointer popped off a ready queue to the host-mapped pointer the collector @@ -40,7 +40,7 @@ a small per-subsystem trait. ```text ┌──────────────────────────────────────────┐ - │ PmuCollector / L2PerfCollector / │ Derived (CRTP) + │ PmuCollector / L2SwimlaneCollector / │ Derived (CRTP) │ TensorDumpCollector │ ─ on_buffer_collected └─────────────┬────────────────────────────┘ ─ kIdleTimeoutSec / kSubsystemName │ public ProfilerBase @@ -58,7 +58,7 @@ a small per-subsystem trait. 
▲ │ Module trait wires layout into algorithms ┌───────────────┴────────────────┐ - │ PmuModule / L2PerfModule / │ Pure static trait (no state) + │ PmuModule / L2SwimlaneModule / │ Pure static trait (no state) │ DumpModule │ ─ DataHeader / ReadyEntry / FreeQueue └────────────────────────────────┘ ─ kBufferKinds / kReadyQueueSize ─ resolve_entry / for_each_instance @@ -129,7 +129,7 @@ is where the unified algorithms live: ### 3.3 `Module` — trait layer -A stateless `struct` per subsystem (`PmuModule`, `L2PerfModule`, +A stateless `struct` per subsystem (`PmuModule`, `L2SwimlaneModule`, `DumpModule`) that tells the generic algorithms what the shared-memory layout looks like. The contract lives in the docblock at the top of [`profiler_base.h`](../src/a2a3/platform/include/host/profiling_common/profiler_base.h); @@ -138,7 +138,7 @@ the required members are: | Member | Purpose | | ------ | ------- | | `using DataHeader / ReadyEntry / ReadyBufferInfo / FreeQueue` | Layout types | -| `kBufferKinds` (PMU=1, Dump=1, L2Perf=2) | Number of per-kind recycled pools | +| `kBufferKinds` (PMU=1, Dump=1, L2Swimlane=2) | Number of per-kind recycled pools | | `kReadyQueueSize`, `kSlotCount` | AICPU ready queue / free queue depth | | `kSubsystemName` | Tag used in framework log lines | | `header_from_shm(void*) → DataHeader*` | Cast shared-memory base to header | @@ -149,7 +149,7 @@ the required members are: The Module structs are defined alongside their collectors in [pmu_collector.h](../src/a2a3/platform/include/host/pmu_collector.h), -[l2_perf_collector.h](../src/a2a3/platform/include/host/l2_perf_collector.h), +[l2_swimlane_collector.h](../src/a2a3/platform/include/host/l2_swimlane_collector.h), and [tensor_dump_collector.h](../src/a2a3/platform/include/host/tensor_dump_collector.h) — each is a few dozen lines of static methods over the subsystem's own `DataHeader` / ringbuffer types. @@ -168,7 +168,7 @@ and only has to provide: the collector loop. 
Use the subsystem's `PLATFORM_*_TIMEOUT_SECONDS` constant. - `static constexpr const char* kSubsystemName` — appears in the idle - timeout log line (e.g. `"PMU"`, `"L2Perf"`, `"TensorDump"`). + timeout log line (e.g. `"PMU"`, `"L2Swimlane"`, `"TensorDump"`). - `init(...)` and `finalize(...)` — domain-specific setup/teardown. `init` must call `set_memory_context()` on the success path so `start(tf)` is not a no-op. `finalize` must release framework-owned @@ -297,7 +297,7 @@ Existing collectors are the canonical examples: — single kind, per-core instances. See [pmu-profiling.md](dfx/pmu-profiling.md). - [`TensorDumpCollector`](../src/a2a3/platform/include/host/tensor_dump_collector.h) — single kind, per-AICPU-thread instances. See [tensor-dump.md](dfx/tensor-dump.md). -- [`L2PerfCollector`](../src/a2a3/platform/include/host/l2_perf_collector.h) +- [`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h) — two kinds (perf records + phase markers), per-core / per-thread instances; the canonical multi-kind example. See [l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md). @@ -332,8 +332,8 @@ changes capture that: **not** called from the mgmt loop — it would race with AICPU writes to device-only fields (`current_buf_ptr`, `total/dropped/mismatch` counters, `queue_tails`, `free_queue.head`, - `AicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them back - to whatever the host shadow had at the start of the tick. Per-buffer payloads (`L2PerfBuffer` / `PmuBuffer` / + `L2SwimlaneAicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them back + to whatever the host shadow had at the start of the tick. Per-buffer payloads (`L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` / `DumpMetaBuffer`) are still pulled on demand inside `ProfilerAlgorithms::process_entry` after resolving the host pointer for a popped ready entry. 
The bulk `mirror_shm_to_device` is kept @@ -363,7 +363,7 @@ per-core ring/reg addresses travel through `KernelArgs`: | `KernelArgs` field | Producer | Consumer | | ------------------ | -------- | -------- | | `enable_profiling_flag` (bitmask) | host (DeviceRunner) | AICPU `kernel.cpp` → `set_l2_swimlane_enabled` / `set_pmu_enabled` / `set_dump_tensor_enabled`; AICore `KERNEL_ENTRY` → `set_aicore_profiling_flag` | -| `aicore_l2_perf_ring_addrs` (table) | host (`L2PerfCollector::initialize`) | AICore `KERNEL_ENTRY` indexes `table[block_idx]` → `set_aicore_l2_perf_ring` | +| `aicore_l2_swimlane_ring_addrs` (table) | host (`L2SwimlaneCollector::initialize`) | AICore `KERNEL_ENTRY` indexes `table[block_idx]` → `set_aicore_l2_swimlane_ring` | | `aicore_pmu_ring_addrs` (table) | host (`PmuCollector::init`) | AICore `KERNEL_ENTRY` → `set_aicore_pmu_ring` | | `regs` (per-physical-core register-base table) | host (already required for AICPU MMIO) | AICore `KERNEL_ENTRY` resolves `regs[get_physical_core_id()]` → `set_aicore_pmu_reg_base`; AICore `aicore_execute` caches the value at Phase-3 | @@ -376,16 +376,16 @@ state surface, never the runtime protocol. ### 8.2 Stable AICore staging ring (decouples AICore write from AICPU buffer rotation) -L2Perf and PMU on a5 both use the "AICore writes, AICPU commits" model. +L2Swimlane and PMU on a5 both use the "AICore writes, AICPU commits" model. The AICore-side write target is a per-core -[`L2PerfAicoreRing`](../src/a5/platform/include/common/l2_perf_profiling.h) / +[`L2SwimlaneAicoreRing`](../src/a5/platform/include/common/l2_swimlane_profiling.h) / [`PmuAicoreRing`](../src/a5/platform/include/common/pmu_profiling.h) of `PLATFORM_{L2,PMU}_AICORE_RING_SIZE` (= 2, dual-issue) slots, allocated once by the host and addressed by `BufferState::aicore_ring_ptr` (AICPU-visible) and the per-core `aicore_*_ring_addrs[block_idx]` (AICore-visible). 
The address is never reassigned, so AICore's write target is stable across AICPU's -rotating `L2PerfBuffer` / `PmuBuffer` flips — flipping is now +rotating `L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` flips — flipping is now fully internal to `*_complete_record` and never crosses into Handshake. Everything else — Module concept contract, alloc policy diff --git a/docs/profiling-name-map.md b/docs/profiling-name-map.md index 105ba2fee..1fff59e9c 100644 --- a/docs/profiling-name-map.md +++ b/docs/profiling-name-map.md @@ -2,7 +2,7 @@ ## Problem -Profiling data (`l2_perf_records.json`) identifies tasks by numeric IDs +Profiling data (`l2_swimlane_records.json`) identifies tasks by numeric IDs (e.g., `func_id: 0`). Without a mapping, swimlane visualizations show opaque labels like `func_0_a(t0)` instead of human-readable names like `QK(t0)`. @@ -45,7 +45,7 @@ Every level uses the same structure: ### L2 (Orchestration + Incores) `callable_id` = incore `func_id` (the integer assigned in the CALLABLE -spec). These are the same IDs that appear in L2 perf data. +spec). These are the same IDs that appear in L2 swimlane data. ```json { @@ -147,10 +147,10 @@ takes precedence over `-k` (kernel_config.py): # Automatic (via SceneTest profiling) pytest tests/st/... --platform a5onboard --enable-l2-swimlane -# Manual (paths land alongside l2_perf_records.json inside the same +# Manual (paths land alongside l2_swimlane_records.json inside the same # directory) python -m simpler_setup.tools.swimlane_converter \ - outputs/_/l2_perf_records.json \ + outputs/_/l2_swimlane_records.json \ --func-names outputs/_/name_map_TestPA_basic.json python -m simpler_setup.tools.deps_to_graph \ @@ -169,7 +169,7 @@ cannot collide. 
```text outputs/TestPA_basic_20260416_151301/ - l2_perf_records.json # perf data (runtime) + l2_swimlane_records.json # perf data (runtime) name_map_TestPA_basic.json # name mapping (SceneTest) merged_swimlane.json # Perfetto trace (converter) ``` diff --git a/docs/sim_multi_device_isolation.md b/docs/sim_multi_device_isolation.md index 09247e0a6..46c5b5d49 100644 --- a/docs/sim_multi_device_isolation.md +++ b/docs/sim_multi_device_isolation.md @@ -24,7 +24,7 @@ Communication uses a 4096-byte shared-memory mailbox per chip — the same layou ## Why Not Fix the Globals -The global state in `host_runtime.so` spans multiple files (`cpu_sim_context.cpp`, `platform_aicpu_affinity.cpp`, `l2_perf_collector_aicpu.cpp`, `device_log.cpp`) and is deeply embedded in the AICPU/AICore thread model. Fixing each one individually is fragile. Process isolation solves all of them at once with zero platform code changes. +The global state in `host_runtime.so` spans multiple files (`cpu_sim_context.cpp`, `platform_aicpu_affinity.cpp`, `l2_swimlane_collector_aicpu.cpp`, `device_log.cpp`) and is deeply embedded in the AICPU/AICore thread model. Fixing each one individually is fragile. Process isolation solves all of them at once with zero platform code changes. ## Files diff --git a/docs/testing.md b/docs/testing.md index c7c9fd735..68f4f9888 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -104,7 +104,7 @@ python test_xxx.py -p a2a3sim --log-level debug # verbose C++ l | `--case SEL` | | (all) | Case selector, repeatable: `Foo`, `ClassA::Foo`, `ClassA::` | | `--manual` | | `exclude` | `exclude`/`include`/`only` for manual cases | | `--skip-golden` | | false | Skip golden comparison (for benchmarking) | -| `--enable-l2-swimlane [PERF_LEVEL]` | | `0` | Enable L2 swimlane collection on first round only. The flag takes an integer perf_level 0–4 (bare = 4); see [docs/dfx/l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md#31-enable-l2-swimlane) for the level table. 
Each test case gets its own `outputs/_/` directory under which `l2_perf_records.json` lands; parallel runs never collide. | +| `--enable-l2-swimlane [PERF_LEVEL]` | | `0` | Enable L2 swimlane collection on first round only. The flag takes an integer perf_level 0–4 (bare = 4); see [docs/dfx/l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md#31-enable-l2-swimlane) for the level table. Each test case gets its own `outputs/_/` directory under which `l2_swimlane_records.json` lands; parallel runs never collide. | | `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution | | `--enable-pmu [EVENT_TYPE]` | | `0` | Enable a2a3 PMU CSV collection. Bare flag selects `PIPE_UTILIZATION` (`2`); pass an event type such as `4` for `MEMORY`. | | `--exitfirst` | `-x` | false | Stop on first failing test (fail-fast, primarily for CI) | @@ -318,13 +318,13 @@ A single file can declare both L2 and L3 classes; they're grouped by `(runtime, Each test case sets its own `CallConfig.output_prefix` (chosen by `scene_test.py::_build_output_prefix` as `outputs/__/`). The C++ runtime writes all diagnostic artifacts under that prefix with fixed filenames: -- `outputs/_/l2_perf_records.json` — swimlane (`--enable-l2-swimlane`) +- `outputs/_/l2_swimlane_records.json` — swimlane (`--enable-l2-swimlane`) - `outputs/_/tensor_dump/` — tensor dump (`--dump-tensor`) - `outputs/_/pmu.csv` — PMU counters (`--enable-pmu`) Because each case gets its own directory, parallel runs (xdist workers, L3 case fanout, L2 device fanout) can never collide on filename — there is no per-file timestamp, no env-var scoping, and no post-run flatten step. `CallConfig::validate()` throws if any diagnostic flag is enabled but `output_prefix` is empty; `scene_test.py::run_class_cases` always fills it from the case label. -Standalone invocations of CLIs (`python -m simpler_setup.tools.swimlane_converter`, etc.) 
auto-detect the latest `outputs/*/l2_perf_records.json` (sorted by mtime); pass `--input ` to override. +Standalone invocations of CLIs (`python -m simpler_setup.tools.swimlane_converter`, etc.) auto-detect the latest `outputs/*/l2_swimlane_records.json` (sorted by mtime); pass `--input ` to override. ### Dispatcher skip conditions (normal pytest runs) diff --git a/examples/workers/l2/vector_add/test_run_timing.py b/examples/workers/l2/vector_add/test_run_timing.py index a3944c087..2624c4173 100644 --- a/examples/workers/l2/vector_add/test_run_timing.py +++ b/examples/workers/l2/vector_add/test_run_timing.py @@ -100,7 +100,7 @@ def test_worker_run_returns_run_timing(st_platform, st_device_ids): # device_wall must also be > 0 without --enable-l2-swimlane after the # Phase B decoupling: orch_summary is written unconditionally when # PTO2_PROFILING is on (default build). Hitting 0 here means either: - # - the AICPU's l2_perf_aicpu_write_orch_summary path regressed back + # - the AICPU's l2_swimlane_aicpu_write_orch_summary path regressed back # under an is_l2_swimlane_enabled() gate, or # - the host stopped reading the phase header after the run. 
assert timing.device_wall_us > 0.0, ( diff --git a/scope_stats/scope_stats.jsonl b/scope_stats/scope_stats.jsonl new file mode 100644 index 000000000..7ce6caf3f --- /dev/null +++ b/scope_stats/scope_stats.jsonl @@ -0,0 +1,3 @@ +{"version": 4, "fatal": false, "dropped": 0, "total": 2, "task_window_max": [16384, 16384, 16384, 16384], "heap_max": [268435456, 268435456, 268435456, 268435456], "tensormap_max": 65536} +{"site": "(unknown):0", "phase": "begin", "depth": 0, "ring": 0, "task_window_start": 0, "task_window_end": 0, "heap_start": 0, "heap_end": 0, "tensormap": 0} +{"site": "(unknown):0", "phase": "end", "depth": 0, "ring": 0, "task_window_start": 0, "task_window_end": 1, "heap_start": 0, "heap_end": 0, "tensormap": 1} diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 29c1c1f16..39f4891a9 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -558,7 +558,7 @@ def _build_output_prefix(case_label: str) -> Path: """Per-case directory for diagnostic artifacts. Each case gets its own ``outputs/_/`` directory; the - runtime writes ``l2_perf_records.json``, ``tensor_dump/``, and ``pmu.csv`` + runtime writes ``l2_swimlane_records.json``, ``tensor_dump/``, and ``pmu.csv`` under that root with fixed filenames. Two cases of the same name run in the same second is not a contemplated scenario (parallel xdist runs differ by class+method). @@ -584,7 +584,7 @@ def _run_swimlane_converter( When ``input_path`` is given, the converter derives its output filename from the input's timestamp (see ``swimlane_converter._resolve_output_path``). - Without it, the converter auto-selects the latest ``l2_perf_records_*.json``. + Without it, the converter auto-selects the latest ``outputs/*/l2_swimlane_records.json``. 
""" import logging # noqa: PLC0415 import subprocess # noqa: PLC0415 @@ -618,13 +618,13 @@ def _convert_case_swimlane( callable_spec: dict | None = None, ) -> None: """Post-case: invoke the swimlane converter on the perf file the runtime - just wrote into ``/l2_perf_records.json``. No diff/rename + just wrote into ``/l2_swimlane_records.json``. No diff/rename dance — the path is known a priori from CallConfig.output_prefix. """ import logging # noqa: PLC0415 logger = logging.getLogger(__name__) - perf_file = output_prefix / "l2_perf_records.json" + perf_file = output_prefix / "l2_swimlane_records.json" if not perf_file.exists(): logger.warning(f"[{case_label}] {perf_file} not produced; skipping conversion") return @@ -693,7 +693,7 @@ def run_class_cases( # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI # Per-case directory the runtime writes into. Required (non-empty) when # any diagnostic flag is on; CallConfig::validate() throws otherwise. # scope_stats now writes /scope_stats/scope_stats.jsonl (sibling of - # l2_perf_records.json / deps.json), so it pulls output_prefix the + # l2_swimlane_records.json / deps.json), so it pulls output_prefix the # same way the other DFX flags do. prefix = _build_output_prefix(case_label) if diagnostics_on else Path("") try: diff --git a/simpler_setup/tools/README.md b/simpler_setup/tools/README.md index a1a548440..d2e42530d 100644 --- a/simpler_setup/tools/README.md +++ b/simpler_setup/tools/README.md @@ -14,7 +14,7 @@ no repo checkout required. 
- **[deps_to_graph](#deps_to_graph)** — `deps.json` (dep_gen) → pan/zoom HTML dependency graph - **[dump_viewer](#dump_viewer)** — inspect / export tensor dumps (see [docs/tensor-dump.md](../../docs/dfx/tensor-dump.md) for full workflow) -Auto-detection paths (`outputs/*/l2_perf_records.json`, `outputs/*/tensor_dump/`) +Auto-detection paths (`outputs/*/l2_swimlane_records.json`, `outputs/*/tensor_dump/`) are resolved relative to the **current working directory** — run these from the directory that holds your `outputs/`. Each test case writes into its own `outputs/_/` directory; the tools auto-pick the latest by mtime. @@ -27,7 +27,7 @@ Convert performance profiling JSON files into Chrome Trace Event format for visu ### Overview -Converts PTO Runtime profiling data (`l2_perf_records_*.json`) into the format used by the Perfetto trace viewer (). It also produces a task execution statistics summary grouped by function and a scheduler overhead deep-dive report (the same one `sched_overhead_analysis` emits). +Converts PTO Runtime profiling data (`l2_swimlane_records_*.json`) into the format used by the Perfetto trace viewer (). It also produces a task execution statistics summary grouped by function and a scheduler overhead deep-dive report (the same one `sched_overhead_analysis` emits). 
### Basic Usage @@ -36,20 +36,20 @@ Converts PTO Runtime profiling data (`l2_perf_records_*.json`) into the format u python -m simpler_setup.tools.swimlane_converter # Specify an input file -python -m simpler_setup.tools.swimlane_converter outputs/_/l2_perf_records.json +python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json # Specify an output file -python -m simpler_setup.tools.swimlane_converter outputs/_/l2_perf_records.json -o custom_output.json +python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json -o custom_output.json # Load function name mapping from kernel_config.py -python -m simpler_setup.tools.swimlane_converter outputs/_/l2_perf_records.json \ +python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json \ -k examples/host_build_graph/paged_attention/kernels/kernel_config.py # Verbose mode (for debugging) -python -m simpler_setup.tools.swimlane_converter outputs/_/l2_perf_records.json -v +python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json -v # Reuse a deps.json captured in an earlier dep_gen run (different output dir) -python -m simpler_setup.tools.swimlane_converter outputs/_/l2_perf_records.json \ +python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json \ --deps-json outputs/_/deps.json ``` @@ -65,7 +65,7 @@ python -m simpler_setup.tools.swimlane_converter outputs/_/l2_perf_rec | Option | Short | Description | | ------ | ----- | ----------- | -| `input` | | Input JSON file (l2_perf_records_*.json). If omitted, the latest file in outputs/ is used | +| `input` | | Input JSON file (l2_swimlane_records_*.json). 
If omitted, the latest file in outputs/ is used | | `--output` | `-o` | Output JSON file (default: outputs/merged_swimlane_``.json) | | `--kernel-config` | `-k` | Path to kernel_config.py, used for function name mapping | | `--func-names` | | Path to func_id_names_*.json (SceneTest format) for function name mapping | @@ -118,7 +118,7 @@ python examples/scripts/run_example.py \ After the test passes, the tool will: -1. Auto-detect the latest `l2_perf_records_*.json` in outputs/ +1. Auto-detect the latest `l2_swimlane_records_*.json` in outputs/ 2. Load function names from the kernel_config.py specified via `-k` 3. Produce `merged_swimlane_*.json` for visualization 4. Print the task statistics and scheduler overhead deep-dive report to the console @@ -133,7 +133,7 @@ Analyze AICPU scheduler overhead and quantitatively decompose the sources of Tai `sched_overhead_analysis` reads two artifacts produced by the runtime: -1. **Perf profiling data** (`l2_perf_records_*.json`, l2_perf_level >= 3): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas. +1. **Perf profiling data** (`l2_swimlane_records_*.json`, l2_swimlane_level >= 3): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas. 2. **`deps.json`** (optional, dep_gen replay output): structural task DAG. When colocated with the perf JSON, Part 2 prints per-thread fanout / fanin aggregates derived from it. 
### Basic Usage @@ -144,11 +144,11 @@ python -m simpler_setup.tools.sched_overhead_analysis # Specify the perf JSON explicitly python -m simpler_setup.tools.sched_overhead_analysis \ - --l2-perf-records-json outputs/_/l2_perf_records.json + --l2-swimlane-records-json outputs/_/l2_swimlane_records.json # Override the deps.json location python -m simpler_setup.tools.sched_overhead_analysis \ - --l2-perf-records-json outputs/_/l2_perf_records.json \ + --l2-swimlane-records-json outputs/_/l2_swimlane_records.json \ --deps-json outputs/_/deps.json ``` @@ -156,7 +156,7 @@ python -m simpler_setup.tools.sched_overhead_analysis \ | Option | Description | | ------ | ----------- | -| `--l2-perf-records-json` | Path to the l2_perf_records_*.json file. If omitted, the latest file in outputs/ is auto-selected | +| `--l2-swimlane-records-json` | Path to the l2_swimlane_records_*.json file. If omitted, the latest file in outputs/ is auto-selected | | `--deps-json` | Path to deps.json (dep_gen replay output) for fanout / fanin aggregates. Defaults to the deps.json sibling of the perf JSON. | ### Outputs @@ -167,7 +167,7 @@ Output is emitted in three parts: - **Part 2: AICPU scheduler loop breakdown** — per-scheduler-thread loop statistics, per-phase (scan / complete / dispatch / idle) time ratios, pop_hit / pop_miss totals, and (when deps.json is available) per-thread fanout / fanin aggregates - **Part 3: Tail OH distribution & cause analysis** — Tail OH quantile distribution (P10–P99), correlation between scheduler loop iteration time and Tail OH, and data-driven insights into the dominant phase -The perf JSON must be captured at l2_perf_level >= 3 so that `aicpu_scheduler_phases` is non-empty (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing). +The perf JSON must be captured at l2_swimlane_level >= 3 so that `aicpu_scheduler_phases` is non-empty (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing). 
--- @@ -279,11 +279,11 @@ python -m simpler_setup.tools.dump_viewer outputs/_/tensor_dump/ --ind ### Input File Format -The analysis tools share the same input format - the `l2_perf_records_*.json` files generated by the PTO Runtime: +The analysis tools share the same input format - the `l2_swimlane_records_*.json` files generated by the PTO Runtime: ```json { - "l2_perf_level": 4, + "l2_swimlane_level": 4, "tasks": [ { "task_id": 0, @@ -320,9 +320,9 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi Dependency edges come from `deps.json` (dep_gen replay) at post-process time — not from the perf JSON. See [`swimlane_converter --deps-json`](#swimlane_converter). -Top-level layout depends on `l2_perf_level`: +Top-level layout depends on `l2_swimlane_level`: -- All levels: `l2_perf_level`, `tasks[]` (per-task fields above). +- All levels: `l2_swimlane_level`, `tasks[]` (per-task fields above). - `>= 3`: also `aicpu_scheduler_phases[]` (per-thread phase records: scan / complete / dispatch / idle) and `core_to_thread[]` (core_id → scheduler thread index). 
@@ -398,7 +398,7 @@ For batch-run hardware regression, see the dev-only script ## Troubleshooting -### Error: cannot find l2_perf_records_*.json file +### Error: cannot find l2_swimlane_records_*.json file - Make sure the test was run with the `--enable-l2-swimlane` flag - Check that the outputs/ directory exists and contains profiling data @@ -408,18 +408,18 @@ For batch-run hardware regression, see the dev-only script - Check the kernel_config.py file format - Make sure every KERNELS entry has a 'func_id' and 'name' field -### Error: Unsupported l2_perf_level +### Error: Unsupported l2_swimlane_level -- The tools accept l2_perf_level 1–4 (the integer captured at runtime +- The tools accept l2_swimlane_level 1–4 (the integer captured at runtime via `--enable-l2-swimlane `) - Regenerate the profiling data with a supported level ### Error: Perf JSON missing required fields for scheduler overhead analysis -- This error means the input `l2_perf_records_*.json` lacks fields required by the deep-dive analysis (typically `dispatch_time_us` / `finish_time_us`) +- This error means the input `l2_swimlane_records_*.json` lacks fields required by the deep-dive analysis (typically `dispatch_time_us` / `finish_time_us`) - The basic conversion in `swimlane_converter` can still succeed, but the deep-dive will be skipped or fail - Remediation: - 1. Re-run with `--enable-l2-swimlane` to produce a new `outputs/*/l2_perf_records.json` + 1. Re-run with `--enable-l2-swimlane` to produce a new `outputs/*/l2_swimlane_records.json` 2. Re-run `swimlane_converter` or `sched_overhead_analysis` 3. 
Verify that each task in the JSON contains `dispatch_time_us` and `finish_time_us` @@ -435,7 +435,7 @@ For batch-run hardware regression, see the dev-only script | File | Tool | Purpose | Format | | ---- | ---- | ------- | ------ | -| `l2_perf_records_*.json` | Runtime | Raw timing profiling data | JSON | +| `l2_swimlane_records_*.json` | Runtime | Raw timing profiling data | JSON | | `merged_swimlane_*.json` | swimlane_converter | Perfetto visualization | Chrome Trace Event JSON | | `deps.json` | Runtime (dep_gen replay) | Structural task dependency graph + per-edge tensor info | JSON | | `deps_graph.html` | deps_to_graph | Pan/zoom dependency graph viewer | HTML (self-contained) | diff --git a/simpler_setup/tools/deps_to_graph.py b/simpler_setup/tools/deps_to_graph.py index ec185e6ce..cba7356c6 100644 --- a/simpler_setup/tools/deps_to_graph.py +++ b/simpler_setup/tools/deps_to_graph.py @@ -21,7 +21,7 @@ gotcha is that high zoom slightly blurs text — that's a CSS-transform tradeoff in exchange for 60fps GPU-composited pan/zoom even on huge graphs. -When ``l2_perf_records.json`` is colocated with ``deps.json``, node labels are +When ``l2_swimlane_records.json`` is colocated with ``deps.json``, node labels are enriched with the per-task ``func_id`` and ``core_type`` so a node reads as ``t12 · kernel_mul · aiv`` rather than just ``t12``; nodes are colored by core_type (AIC blue, AIV orange). @@ -44,7 +44,7 @@ def _normalize_task_id(v): - """Unsigned 64-bit task id (matches deps.json edges and l2_perf task_id). + """Unsigned 64-bit task id (matches deps.json edges and l2_swimlane task_id). Accepts ints (legacy) and strings (current schema): deps.json emits all uint64 fields as quoted strings to dodge JSON-number precision loss in @@ -239,7 +239,7 @@ def _backfill_output_tensor_ids(task_table, annotations): def _load_task_meta(deps_path, func_names=None): - """Optional l2_perf_records.json sidecar → {task_id: {'func_id', 'core_type', ...}}. 
+ """Optional l2_swimlane_records.json sidecar → {task_id: {'func_id', 'core_type', ...}}. Mixed-kernel tasks (single submit_task that spans both AIC and AIV blocks) appear as multiple perf-record entries with the same ``task_id`` but @@ -252,7 +252,7 @@ def _load_task_meta(deps_path, func_names=None): Returns {} if no sidecar present. ``func_names`` (optional dict) overrides the default ``f{func_id}`` label with a human name. """ - perf_path = Path(deps_path).parent / "l2_perf_records.json" + perf_path = Path(deps_path).parent / "l2_swimlane_records.json" if not perf_path.exists(): return {} try: @@ -318,7 +318,7 @@ def _label(task_id, meta, fmt_task, have_perf=False): # is an ellipse; "mix" (single submit_task spanning both core types) is a # diamond; "alloc" — a task that came from ``alloc_tensors`` (got a real # task_id and shows up as a producer in deps via ``owner_task_id``, but -# never dispatched a kernel so no l2_perf record and no func_id) — is a +# never dispatched a kernel so no l2_swimlane record and no func_id) — is a # dashed gray note. Distinct shape AND color so each stays readable even # without color (B&W print, accessibility, etc.). _CORE_STYLE = { diff --git a/simpler_setup/tools/sched_overhead_analysis.py b/simpler_setup/tools/sched_overhead_analysis.py index 53ad97fd8..6f7579ac8 100644 --- a/simpler_setup/tools/sched_overhead_analysis.py +++ b/simpler_setup/tools/sched_overhead_analysis.py @@ -10,7 +10,7 @@ """Scheduler overhead analysis for PTO2. Inputs: - 1. Per-task perf profiling data (l2_perf_records_*.json) with + 1. Per-task perf profiling data (l2_swimlane_records_*.json) with ``aicpu_scheduler_phases`` populated by ``--enable-l2-swimlane`` at level >= 3. 2. 
deps.json (optional, dep_gen replay output) colocated with the perf JSON, @@ -18,8 +18,8 @@ Usage: python -m simpler_setup.tools.sched_overhead_analysis # auto-select latest files - python -m simpler_setup.tools.sched_overhead_analysis --l2-perf-records-json - python -m simpler_setup.tools.sched_overhead_analysis --l2-perf-records-json --deps-json + python -m simpler_setup.tools.sched_overhead_analysis --l2-swimlane-records-json + python -m simpler_setup.tools.sched_overhead_analysis --l2-swimlane-records-json --deps-json """ import argparse @@ -48,7 +48,7 @@ def compute_dag_stats_from_deps(deps_data, perf_data, threads): Why this lives in Python and not the runtime: the DAG edge set is already captured structurally by dep_gen (deps.json), and the per-task → scheduler- - thread map is in ``l2_perf_records.json::core_to_thread``. Re-instrumenting + thread map is in ``l2_swimlane_records.json::core_to_thread``. Re-instrumenting the AICPU to track fanout edge counts is duplicate work; running this in Python over the existing artifacts is cheaper, more accurate (deps.json captures #599 race-window edges that fanout[] dropped), and lets the @@ -108,7 +108,7 @@ def task_thread(task): per_thread_fanin = defaultdict(lambda: {"edges": 0, "max": 0, "tasks": 0}) # Dedup by task_id: mixed (AIC+AIV) tasks emit one perf row per subtask / - # core (see l2_perf_collector.cpp:567 — collected_perf_records_ is keyed by + # core (see l2_swimlane_collector.cpp:567 — collected_perf_records_ is keyed by # core_idx). Without dedup a mixed task's fanout would be charged once per # subtask, inflating per-thread edge counts by the subtask count. 
seen_task_ids = set() @@ -145,20 +145,20 @@ def task_thread(task): t["fanin_max_degree"] = fi["max"] -def auto_select_l2_perf_records_json(): - """Find the latest outputs//l2_perf_records.json (sorted by mtime).""" +def auto_select_l2_swimlane_records_json(): + """Find the latest outputs//l2_swimlane_records.json (sorted by mtime).""" outputs_dir = Path.cwd() / "outputs" - files = sorted(outputs_dir.glob("*/l2_perf_records.json"), key=lambda p: p.stat().st_mtime, reverse=True) + files = sorted(outputs_dir.glob("*/l2_swimlane_records.json"), key=lambda p: p.stat().st_mtime, reverse=True) if not files: - raise FileNotFoundError(f"No outputs/*/l2_perf_records.json found under {outputs_dir}") + raise FileNotFoundError(f"No outputs/*/l2_swimlane_records.json found under {outputs_dir}") return files[0] def parse_scheduler_from_json_phases(data): - """Extract scheduler Phase breakdown from l2_perf_records JSON. + """Extract scheduler Phase breakdown from l2_swimlane_records JSON. Computes per-thread loop counts, task counts, and phase totals - from aicpu_scheduler_phases records (present at l2_perf_level >= 3). + from aicpu_scheduler_phases records (present at l2_swimlane_level >= 3). Returns: dict: Thread data keyed by thread index, with per-phase us / pct, @@ -279,12 +279,12 @@ def validate_perf_tasks_for_overhead_analysis(tasks): f"Missing required fields (showing up to 5 tasks): {detail}", "", "Why this happens:", - " - The input is not a runtime-generated l2_perf_records_*.json, OR", + " - The input is not a runtime-generated l2_swimlane_records_*.json, OR", " - The runtime binary does not include / emit dispatch+finish timestamps.", "", "How to fix:", " 1) Re-run workload with profiling enabled (e.g. 
run_example.py --enable-l2-swimlane).", - " 2) Pass the newly generated outputs//l2_perf_records.json via --l2-perf-records-json.", + " 2) Pass the newly generated outputs//l2_swimlane_records.json via --l2-swimlane-records-json.", " 3) Verify each task includes dispatch_time_us and finish_time_us.", "", "Note:", @@ -297,7 +297,7 @@ def validate_perf_tasks_for_overhead_analysis(tasks): def run_analysis( # noqa: PLR0912, PLR0915 - l2_perf_records_path, + l2_swimlane_records_path, print_sources=True, deps_json_path=None, perf_data=None, @@ -305,7 +305,7 @@ def run_analysis( # noqa: PLR0912, PLR0915 """Run scheduler overhead analysis report. Args: - l2_perf_records_path: Path to l2_perf_records_*.json. + l2_swimlane_records_path: Path to l2_swimlane_records_*.json. print_sources: Whether to print selected input files. perf_data: Optional pre-parsed perf JSON dict. When provided, skip re-reading from disk — main() already parses the file to probe @@ -318,20 +318,20 @@ def run_analysis( # noqa: PLR0912, PLR0915 Returns: int: 0 on success, non-zero on failure. """ - l2_perf_records_path = Path(l2_perf_records_path) + l2_swimlane_records_path = Path(l2_swimlane_records_path) - if not l2_perf_records_path.exists(): - print(f"Error: Perf JSON not found: {l2_perf_records_path}", file=sys.stderr) + if not l2_swimlane_records_path.exists(): + print(f"Error: Perf JSON not found: {l2_swimlane_records_path}", file=sys.stderr) return 1 # Auto-discover deps.json sibling when caller didn't specify one. 
if deps_json_path is None: - sibling = l2_perf_records_path.parent / "deps.json" + sibling = l2_swimlane_records_path.parent / "deps.json" if sibling.exists(): deps_json_path = sibling if print_sources: - print(f"Perf data: {l2_perf_records_path}") + print(f"Perf data: {l2_swimlane_records_path}") if deps_json_path is not None: print(f"Deps JSON: {deps_json_path}") @@ -339,7 +339,7 @@ def run_analysis( # noqa: PLR0912, PLR0915 if perf_data is not None: data = perf_data else: - with open(l2_perf_records_path) as f: + with open(l2_swimlane_records_path) as f: data = json.load(f) tasks = data["tasks"] n_total = len(tasks) @@ -573,13 +573,14 @@ def main(): epilog=""" Examples: %(prog)s # auto-select latest files - %(prog)s --l2-perf-records-json outputs/_/l2_perf_records.json - %(prog)s --l2-perf-records-json outputs/_/l2_perf_records.json --deps-json outputs/_/deps.json + %(prog)s --l2-swimlane-records-json outputs/_/l2_swimlane_records.json + %(prog)s --l2-swimlane-records-json outputs/_/l2_swimlane_records.json \ + --deps-json outputs/_/deps.json """, ) parser.add_argument( - "--l2-perf-records-json", - help="Path to l2_perf_records_*.json file. If not specified, uses the latest in outputs/", + "--l2-swimlane-records-json", + help="Path to l2_swimlane_records_*.json file. 
If not specified, uses the latest in outputs/", ) parser.add_argument( "--deps-json", @@ -593,30 +594,32 @@ def main(): # Resolve perf path try: - l2_perf_records_path = ( - Path(args.l2_perf_records_json) if args.l2_perf_records_json else auto_select_l2_perf_records_json() + l2_swimlane_records_path = ( + Path(args.l2_swimlane_records_json) + if args.l2_swimlane_records_json + else auto_select_l2_swimlane_records_json() ) except FileNotFoundError as e: print(f"Error: {e}", file=sys.stderr) return 1 - if not l2_perf_records_path.exists(): - print(f"Error: Perf JSON not found: {l2_perf_records_path}", file=sys.stderr) + if not l2_swimlane_records_path.exists(): + print(f"Error: Perf JSON not found: {l2_swimlane_records_path}", file=sys.stderr) return 1 # Single load — pass the parsed dict to run_analysis() so it doesn't # reread the file (large artifacts hit JSON parsing twice otherwise). try: - with open(l2_perf_records_path) as _f: + with open(l2_swimlane_records_path) as _f: perf_data = json.load(_f) except (OSError, ValueError) as e: - print(f"Error: failed to read perf JSON {l2_perf_records_path}: {e}", file=sys.stderr) + print(f"Error: failed to read perf JSON {l2_swimlane_records_path}: {e}", file=sys.stderr) return 1 deps_json_path = Path(args.deps_json) if args.deps_json else None return run_analysis( - l2_perf_records_path, + l2_swimlane_records_path, print_sources=True, deps_json_path=deps_json_path, perf_data=perf_data, diff --git a/simpler_setup/tools/swimlane_converter.py b/simpler_setup/tools/swimlane_converter.py index 1d7fe2103..e2fd941c5 100644 --- a/simpler_setup/tools/swimlane_converter.py +++ b/simpler_setup/tools/swimlane_converter.py @@ -14,11 +14,11 @@ for visualization in Perfetto (https://ui.perfetto.dev/). 
Usage: - python -m simpler_setup.tools.swimlane_converter # latest l2_perf_records_*.json under ./outputs/ - python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json - python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json -o out.json - python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json -k kernel_config.py - python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json -v + python -m simpler_setup.tools.swimlane_converter # latest l2_swimlane_records_*.json under ./outputs/ + python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json + python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json -o out.json + python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json -k kernel_config.py + python -m simpler_setup.tools.swimlane_converter outputs/_/l2_swimlane_records.json -v """ import argparse @@ -93,7 +93,7 @@ def read_perf_data(filepath): Returns: dict: Parsed performance data with keys: - - l2_perf_level + - l2_swimlane_level - tasks (list) Raises: @@ -102,13 +102,13 @@ def read_perf_data(filepath): with open(filepath) as f: data = json.load(f) - required_fields = ["l2_perf_level", "tasks"] + required_fields = ["l2_swimlane_level", "tasks"] for field in required_fields: if field not in data: raise ValueError(f"Missing required field: {field}") - if data["l2_perf_level"] not in [1, 2, 3, 4]: - raise ValueError(f"Unsupported l2_perf_level: {data['l2_perf_level']} (expected 1, 2, 3, or 4)") + if data["l2_swimlane_level"] not in [1, 2, 3, 4]: + raise ValueError(f"Unsupported l2_swimlane_level: {data['l2_swimlane_level']} (expected 1, 2, 3, or 4)") return data @@ -393,15 +393,15 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 output_path: Path to output JSON file func_id_to_name: Optional dict mapping func_id to function name verbose: Print progress information - 
scheduler_phases: Optional list of per-thread phase record lists (l2_perf_level >= 3) - orchestrator_phases: Optional list of per-task orchestrator phase records (l2_perf_level >= 4) + scheduler_phases: Optional list of per-thread phase record lists (l2_swimlane_level >= 3) + orchestrator_phases: Optional list of per-task orchestrator phase records (l2_swimlane_level >= 4) core_to_thread: Optional list mapping core_id (index) to scheduler thread index (-1 = unassigned) Generates processes in the trace: - pid=1 "AICore View": start_time_us to end_time_us (kernel execution) - pid=2 "AICPU View": dispatch_time_us to finish_time_us (AICPU perspective) - - pid=3 "AICPU Scheduler": scheduler phase bars (l2_perf_level >= 3) - - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (l2_perf_level >= 4) + - pid=3 "AICPU Scheduler": scheduler phase bars (l2_swimlane_level >= 3) + - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (l2_swimlane_level >= 4) """ if verbose: print("Generating Chrome Trace JSON...") @@ -698,7 +698,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 if hb_violation_count > 0: print(f" Happens-before violations: {hb_violation_count} edge(s) flagged as 'hb_violation'") - # AICPU Scheduler phase events (l2_perf_level >= 3) + # AICPU Scheduler phase events (l2_swimlane_level >= 3) if scheduler_phases: # Process metadata events.append( @@ -764,7 +764,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 } ) - # AICPU Orchestrator lane (l2_perf_level >= 4) + # AICPU Orchestrator lane (l2_swimlane_level >= 4) # # Per-event AicpuPhaseRecord[] is the single source of truth for # orchestrator timing. 
There is no separate aggregate summary — the @@ -1094,17 +1094,18 @@ def _build_parser(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - %(prog)s # Use latest .json in outputs/, output to outputs/ - %(prog)s l2_perf_records_20260210_143526.json # Output: outputs/merged_swimlane_20260210_143526.json - %(prog)s l2_perf_records_20260210_143526.json -o custom_output.json - %(prog)s l2_perf_records_20260210_143526.json -k examples/host_build_graph/paged_attention/kernels/kernel_config.py - %(prog)s l2_perf_records_20260210_143526.json -v + %(prog)s # Use latest .json in outputs/, output to outputs/ + %(prog)s outputs/_/l2_swimlane_records.json # Output: outputs/merged_swimlane_20260210_143526.json + %(prog)s outputs/_/l2_swimlane_records.json -o custom_output.json + %(prog)s outputs/_/l2_swimlane_records.json \ + -k examples/host_build_graph/paged_attention/kernels/kernel_config.py + %(prog)s outputs/_/l2_swimlane_records.json -v """, ) parser.add_argument( "input", nargs="?", - help="Input JSON file (.json). If not specified, uses the latest l2_perf_records_*.json in outputs/", + help="Input JSON file (.json). 
If not specified, uses the latest l2_swimlane_records_*.json in outputs/", ) parser.add_argument("-o", "--output", help="Output JSON file (default: /merged_swimlane.json)") parser.add_argument( @@ -1128,7 +1129,7 @@ def _build_parser(): def _resolve_input_path(args): - """Resolve input path, auto-selecting newest outputs//l2_perf_records.json if unspecified.""" + """Resolve input path, auto-selecting newest outputs//l2_swimlane_records.json if unspecified.""" if args.input is not None: input_path = Path(args.input) if not input_path.exists(): @@ -1137,9 +1138,9 @@ def _resolve_input_path(args): return input_path outputs_dir = Path.cwd() / "outputs" - json_files = list(outputs_dir.glob("*/l2_perf_records.json")) + json_files = list(outputs_dir.glob("*/l2_swimlane_records.json")) if not json_files: - print(f"Error: No outputs/*/l2_perf_records.json found under {outputs_dir}", file=sys.stderr) + print(f"Error: No outputs/*/l2_swimlane_records.json found under {outputs_dir}", file=sys.stderr) print("Run a test with --enable-l2-swimlane first, or specify an explicit input.", file=sys.stderr) return None @@ -1161,11 +1162,11 @@ def _resolve_output_path(args, input_path): def _print_verbose_data_info(data, verbose): """Print verbose summary of loaded performance data, including phase counts - when present (l2_perf_level >= SCHED_PHASES).""" + when present (l2_swimlane_level >= SCHED_PHASES).""" if not verbose: return print("\n=== Performance Data ===") - print(f" L2 perf level: {data['l2_perf_level']}") + print(f" L2 perf level: {data['l2_swimlane_level']}") print(f" Task Count: {len(data['tasks'])}") if data["tasks"]: start_times = [t["start_time_us"] for t in data["tasks"]] diff --git a/src/a2a3/platform/include/aicore/aicore_profiling_state.h b/src/a2a3/platform/include/aicore/aicore_profiling_state.h index b41a60dbb..7d48c91a7 100644 --- a/src/a2a3/platform/include/aicore/aicore_profiling_state.h +++ b/src/a2a3/platform/include/aicore/aicore_profiling_state.h @@ 
-25,14 +25,14 @@ * * Lifecycle: * 1. Host fills `KernelArgs::enable_profiling_flag` and - * `KernelArgs::aicore_ring_addr` (points to a per-core `AicoreRotation` + * `KernelArgs::l2_swimlane_aicore_rotation_table` (points to a per-core `L2SwimlaneAicoreRotation` * device-address table). Host allocates the table bytes; AICPU populates - * the entries inside `l2_perf_aicpu_init`. - * 2. AICore kernel entry stashes `&aicore_ring_addr[block_idx]` (the slot + * the entries inside `l2_swimlane_aicpu_init`. + * 2. AICore kernel entry stashes `&l2_swimlane_aicore_rotation_table[block_idx]` (the slot * pointer — NOT the dereferenced rotation pointer yet) via - * `set_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`, + * `set_l2_swimlane_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`, * before invoking `aicore_execute`. - * 3. `get_aicore_rotation()` lazily dereferences the slot the first time + * 3. `get_l2_swimlane_aicore_rotation()` lazily dereferences the slot the first time * it is called. Callers must defer the call until AFTER AICPU has * dispatched the first task (so AICPU init has had a chance to populate * the table). The executor handles this by calling it inside the main @@ -45,7 +45,7 @@ #include #include "aicore/aicore.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" /** * Profiling enable bitmask (umbrella over dump_tensor / l2_swimlane / pmu). @@ -58,20 +58,20 @@ __aicore__ uint32_t get_aicore_profiling_flag(); /** * Per-core AICore rotation channel. * - * `set_aicore_rotation_slot(slot)` stashes the address of THIS core's slot - * in the rotation-address table — `&((uint64_t*)k_args->aicore_ring_addr)[block_idx]`. + * `set_l2_swimlane_aicore_rotation_slot(slot)` stashes the address of THIS core's slot + * in the rotation-address table — `&((uint64_t*)k_args->l2_swimlane_aicore_rotation_table)[block_idx]`. 
* No dereference happens here, because at kernel entry the AICPU side may * not yet have populated the table (the host launches both kernels and * AICPU's init runs concurrently with AICore's entry). * - * `get_aicore_rotation()` lazily dereferences the stashed slot on first use, + * `get_l2_swimlane_aicore_rotation()` lazily dereferences the stashed slot on first use, * caches the result, and returns it on subsequent calls. Callers MUST defer * the first call until after AICPU has dispatched the first task — by then * AICPU's init has completed and the slot holds a valid device address. * The executor's main loop honours this by reading the rotation only inside * the first-task branch of the dispatch poll. */ -__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr); -__aicore__ __gm__ AicoreRotation *get_aicore_rotation(); +__aicore__ void set_l2_swimlane_aicore_rotation_slot(__gm__ uint64_t *slot_ptr); +__aicore__ __gm__ L2SwimlaneAicoreRotation *get_l2_swimlane_aicore_rotation(); #endif // PLATFORM_AICORE_AICORE_PROFILING_STATE_H_ diff --git a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h b/src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h similarity index 75% rename from src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h rename to src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h index c5aaadd0b..c6456dde2 100644 --- a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h +++ b/src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h @@ -9,17 +9,17 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * @file l2_perf_collector_aicore.h + * @file l2_swimlane_collector_aicore.h * @brief AICore performance data collection interface * * Provides lightweight performance recording interface for AICore kernels. * Uses dcci for efficient cache management instead of memory barriers. 
*/ -#ifndef PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_ -#define PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_ +#ifndef PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_ +#define PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_ -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "aicore/aicore.h" // Include platform-specific timestamp implementation @@ -33,13 +33,13 @@ /** * AICore-local rotation state. Tracks which buffer this core is currently - * writing into and which slot is next. Reset by `l2_perf_aicore_record_task` - * when it observes a generation bump on the shared `AicoreRotation` channel + * writing into and which slot is next. Reset by `l2_swimlane_aicore_record_task` + * when it observes a generation bump on the shared `L2SwimlaneAicoreRotation` channel * (AICPU rotates by writing `current_buf_ptr` + bumping `generation`, so the * AICore-local state self-recovers without any AICore-side spin-wait). */ -struct AicoreLocalState { - __gm__ L2PerfAicoreBuffer *cached_buf = nullptr; +struct L2SwimlaneAicoreLocalState { + __gm__ L2SwimlaneAicoreTaskBuffer *cached_buf = nullptr; // Must start != AICPU's initial generation (1) so the first record_task // call observes a generation mismatch and loads the buffer pointer. uint32_t cached_generation = 0; @@ -49,10 +49,10 @@ struct AicoreLocalState { /** * Record task execution performance data. * - * AICore writes a slim L2PerfAicoreRecord into its currently-published - * per-core L2PerfAicoreBuffer at `records[slot_within_buf++]`. The - * publication channel is an AicoreRotation cache line addressed via - * `KernelArgs::aicore_ring_addr[block_idx]` (now points to AicoreRotation, + * AICore writes a slim L2SwimlaneAicoreTaskRecord into its currently-published + * per-core L2SwimlaneAicoreTaskBuffer at `records[slot_within_buf++]`. 
The + * publication channel is an L2SwimlaneAicoreRotation cache line addressed via + * `KernelArgs::l2_swimlane_aicore_rotation_table[block_idx]` (now points to L2SwimlaneAicoreRotation, * not directly to a buffer). AICPU updates `rotation->current_buf_ptr` and * bumps `rotation->generation` at dispatch boundaries; AICore detects the * change by `dcci`-ing the rotation line per task and comparing generation @@ -69,22 +69,23 @@ struct AicoreLocalState { * so AICore has already finished writing their records before AICPU enqueues * the old buffer to the ready queue. * - * @param rotation Per-core AicoreRotation channel (cached at kernel entry - * from KernelArgs::aicore_ring_addr[block_idx]) + * @param rotation Per-core L2SwimlaneAicoreRotation channel (cached at kernel entry + * from KernelArgs::l2_swimlane_aicore_rotation_table[block_idx]) * @param local Per-core AICore-local state (caller-owned static) * @param task_id Register dispatch id (DATA_MAIN_BASE), low 32 bits * @param start_time Start timestamp (get_sys_cnt) * @param end_time End timestamp */ -__aicore__ __attribute__((always_inline)) static inline void l2_perf_aicore_record_task( - __gm__ AicoreRotation *rotation, AicoreLocalState *local, uint32_t task_id, uint64_t start_time, uint64_t end_time +__aicore__ __attribute__((always_inline)) static inline void l2_swimlane_aicore_record_task( + __gm__ L2SwimlaneAicoreRotation *rotation, L2SwimlaneAicoreLocalState *local, uint32_t task_id, uint64_t start_time, + uint64_t end_time ) { // Re-fetch rotation channel each task; cheap relative to the // baseline `dcci(payload, ENTIRE_DATA_CACHE)` we already pay per task. 
dcci(rotation, SINGLE_CACHE_LINE); if (rotation->generation != local->cached_generation) { local->cached_generation = rotation->generation; - local->cached_buf = reinterpret_cast<__gm__ L2PerfAicoreBuffer *>(rotation->current_buf_ptr); + local->cached_buf = reinterpret_cast<__gm__ L2SwimlaneAicoreTaskBuffer *>(rotation->current_buf_ptr); local->slot_within_buf = 0; } if (local->cached_buf == nullptr) { @@ -102,7 +103,7 @@ __aicore__ __attribute__((always_inline)) static inline void l2_perf_aicore_reco return; } - __gm__ L2PerfAicoreRecord *record = &local->cached_buf->records[slot]; + __gm__ L2SwimlaneAicoreTaskRecord *record = &local->cached_buf->records[slot]; record->start_time = start_time; record->end_time = end_time; record->task_id = task_id; @@ -113,4 +114,4 @@ __aicore__ __attribute__((always_inline)) static inline void l2_perf_aicore_reco dsb((mem_dsb_t)0); } -#endif // PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_ +#endif // PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_ diff --git a/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h b/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h index 12e9a74e6..c2a16a859 100644 --- a/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h @@ -53,7 +53,7 @@ extern "C" bool is_dep_gen_enabled(); * the per-thread ready_queue when buffers fill or on flush. Must be called by * aicpu_executor.cpp before any dep_gen_aicpu_record_submit() can fire. * - * Mirrors l2_perf_aicpu_set_orch_thread_idx(). + * Mirrors l2_swimlane_aicpu_set_orch_thread_idx(). 
*/ void dep_gen_aicpu_set_orch_thread_idx(int thread_idx); diff --git a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h similarity index 66% rename from src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h rename to src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h index 03465e02b..ecfb9723e 100644 --- a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h @@ -9,17 +9,17 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * @file l2_perf_collector_aicpu.h + * @file l2_swimlane_collector_aicpu.h * @brief AICPU performance data collection interface * * Provides performance profiling management interface for AICPU side. * Handles buffer initialization, switching, and flushing. */ -#ifndef PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_ -#define PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_ +#ifndef PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_ +#define PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_ -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" // Include platform-specific timestamp implementation // Build system selects the correct inner_aicpu.h based on platform: @@ -29,51 +29,51 @@ // ============= Public Interface ============= /** - * L2 perf handshake setters — called by the host (sim) or the AICPU kernel - * entry (onboard) before `l2_perf_aicpu_init()` so AICPU code can read perf + * L2 swimlane handshake setters — called by the host (sim) or the AICPU kernel + * entry (onboard) before `l2_swimlane_aicpu_init()` so AICPU code can read perf * state without reaching into the generic `Runtime` struct. * * Two-channel level transport (mirrors the PMU pattern): * - binary on/off — `enable_profiling_flag` bit1 → `set_l2_swimlane_enabled(bool)` * at kernel entry; queried via `is_l2_swimlane_enabled()`. 
- * - granular L2PerfLevel — `L2PerfDataHeader::l2_perf_level` - * (shared memory); read in `l2_perf_aicpu_init` and cached, then queried - * via `get_l2_perf_level()` for + * - granular L2SwimlaneLevel — `L2SwimlaneDataHeader::l2_swimlane_level` + * (shared memory); read in `l2_swimlane_aicpu_init` and cached, then queried + * via `get_l2_swimlane_level()` for * `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates. */ -extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base); -extern "C" uint64_t get_platform_l2_perf_base(); +extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base); +extern "C" uint64_t get_platform_l2_swimlane_base(); extern "C" void set_l2_swimlane_enabled(bool enable); extern "C" bool is_l2_swimlane_enabled(); -// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr). +// AICore rotation-table device pointer (= KernelArgs::l2_swimlane_aicore_rotation_table). // Published by the host before AICPU init runs; AICPU init fills the table -// with the per-core `&L2PerfAicoreBufferState::rotation` device addresses so -// AICore can index `aicore_ring_addr[block_idx]` to find its rotation channel. +// with the per-core `&L2SwimlaneAicoreTaskPool::rotation` device addresses so +// AICore can index `l2_swimlane_aicore_rotation_table[block_idx]` to find its rotation channel. // Moved from host into AICPU so the host stays decoupled from the AICore-side // shared-memory layout (host previously did host-to-device address translation // + reached into get_aicore_buffer_state to fill this). -extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr); -extern "C" uint64_t get_platform_aicore_rotation_table(); +extern "C" void set_platform_l2_swimlane_aicore_rotation_table(uint64_t table_addr); +extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table(); // Typed getter for the granular perf_level (promoted from the shared-memory -// header inside l2_perf_aicpu_init). 
Gate sites should use this so the -// comparison RHS is a named L2PerfLevel constant. -L2PerfLevel get_l2_perf_level(); +// header inside l2_swimlane_aicpu_init). Gate sites should use this so the +// comparison RHS is a named L2SwimlaneLevel constant. +L2SwimlaneLevel get_l2_swimlane_level(); /** * Initialize performance profiling * * Sets up the AICPU buffer pool for each core and initializes tracking state. - * Reads the perf device-base pointer published via `set_platform_l2_perf_base()`. + * Reads the perf device-base pointer published via `set_platform_l2_swimlane_base()`. * * Also primes the per-core AICore rotation channel: pops the initial - * L2PerfAicoreBuffer from L2PerfAicoreBufferState::free_queue and writes its - * address into the AicoreRotation channel that AICore polls per task. + * L2SwimlaneAicoreTaskBuffer from L2SwimlaneAicoreTaskPool::free_queue and writes its + * address into the L2SwimlaneAicoreRotation channel that AICore polls per task. * * @param worker_count Number of AICore workers (cores) to initialize */ -void l2_perf_aicpu_init(int worker_count); +void l2_swimlane_aicpu_init(int worker_count); /** * Rotate the AICore buffer for a given core, if needed. @@ -89,22 +89,22 @@ void l2_perf_aicpu_init(int worker_count); * (and AICore has finished writing their records into the old buffer) before * the old buffer enters the ready queue. * - * Called regardless of l2_perf_level — internally gates on AICORE_TIMING. + * Called regardless of l2_swimlane_level — internally gates on AICORE_TIMING. 
* * @param core_id Core index * @param thread_idx Owning AICPU thread (target ready-queue) */ -void l2_perf_aicpu_maybe_rotate_aicore(int core_id, int thread_idx); +void l2_swimlane_aicpu_maybe_rotate_aicore(int core_id, int thread_idx); /** - * Complete a L2PerfRecord with AICPU-side metadata after AICore task completion + * Complete a L2SwimlaneAicpuTaskRecord with AICPU-side metadata after AICore task completion * * AICore-as-producer: AICore writes start/end/task_id directly into the - * per-core L2PerfAicoreBuffer at `records[reg_task_id % SIZE]`. AICPU does + * per-core L2SwimlaneAicoreTaskBuffer at `records[reg_task_id % SIZE]`. AICPU does * NOT read that buffer on the hot path — it only writes AICPU-owned fields * (task_id, reg_task_id, func_id, core_type, dispatch_time, finish_time) * here, leaving start/end as zero. The host post-processor joins the AICore - * stream into the L2PerfRecord stream by `reg_task_id` at flush time. + * stream into the L2SwimlaneAicpuTaskRecord stream by `reg_task_id` at flush time. 
* * Per-core counter accounting: * total_record_count++ — every commit attempt (success or failure) @@ -115,14 +115,14 @@ void l2_perf_aicpu_maybe_rotate_aicore(int core_id, int thread_idx); * @param core_id Core index — used to resolve buffer state and update counters * @param thread_idx Owning AICPU thread (used when rotating records buffer) * @param expected_reg_task_id Register dispatch token (low 32 bits) — written - * into L2PerfRecord.reg_task_id as the join key + * into L2SwimlaneAicpuTaskRecord.reg_task_id as the join key * @param task_id Task identifier to write (PTO2 encoding or plain id) * @param func_id Kernel function identifier * @param core_type Core type (AIC/AIV) * @param dispatch_time AICPU timestamp when task was dispatched * @param finish_time AICPU timestamp when task completion was observed */ -int l2_perf_aicpu_complete_record( +int l2_swimlane_aicpu_complete_task( int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type, uint64_t dispatch_time, uint64_t finish_time ); @@ -136,24 +136,24 @@ int l2_perf_aicpu_complete_record( * @param cur_thread_cores Array of core IDs managed by this thread * @param core_num Number of cores managed by this thread */ -void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num); +void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num); /** * Initialize AICPU phase profiling * - * Sets up AicpuPhaseHeader and clears per-thread phase record buffers. - * Must be called once from thread 0 after l2_perf_aicpu_init(). + * Sets up L2SwimlaneAicpuPhaseHeader and clears per-thread phase record buffers. + * Must be called once from thread 0 after l2_swimlane_aicpu_init(). 
* * @param worker_count Number of AICore workers (cores) — used to resolve - * the phase region's offset relative to the L2Perf base + * the phase region's offset relative to the L2Swimlane base * @param num_sched_threads Number of scheduler threads */ -void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads); +void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads); /** * Record a single scheduler phase * - * Appends an AicpuPhaseRecord to the specified thread's buffer. + * Appends an L2SwimlaneAicpuPhaseRecord to the specified thread's buffer. * Silently drops records when the buffer is full. * * @param thread_idx Scheduler thread index @@ -164,12 +164,12 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads); * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or * full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator * phases in tensormap_and_ringbuffer) - * @param extra1, extra2 Phase-specific delta counters (see AicpuPhaseRecord doc). + * @param extra1, extra2 Phase-specific delta counters (see L2SwimlaneAicpuPhaseRecord doc). * SCHED_DISPATCH uses extra1=pop_hit, extra2=pop_miss; other * phases pass 0. */ -void l2_perf_aicpu_record_phase( - int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, +void l2_swimlane_aicpu_record_phase( + int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, uint64_t tasks_processed, uint32_t extra1 = 0, uint32_t extra2 = 0 ); @@ -177,22 +177,22 @@ void l2_perf_aicpu_record_phase( * Set orchestrator thread index for per-task phase recording * * Must be called once from the orchestrator thread before any - * l2_perf_aicpu_record_orch_phase() calls. + * l2_swimlane_aicpu_record_orch_phase() calls. 
* * @param thread_idx Thread index for the orchestrator (typically num_sched_threads) */ -void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); +void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx); /** * Record one orchestrator submit envelope * - * Appends an AicpuPhaseRecord covering an entire submit_task() / alloc_tensors() + * Appends an L2SwimlaneAicpuPhaseRecord covering an entire submit_task() / alloc_tensors() * call. Uses the orchestrator's dedicated buffer slot (set via * set_orch_thread_idx). Per-sub-step phase records (ORCH_SYNC..ORCH_FANIN) * were dropped — the per-step cumulatives (`g_orch_*_cycle`) in the * cold-path log carry the breakdown that those records were duplicating. * - * @param phase_id Always AicpuPhaseId::ORCH_SUBMIT. (Param kept for API + * @param phase_id Always L2SwimlaneAicpuPhaseId::ORCH_SUBMIT. (Param kept for API * stability; legacy values are ignored by the host parser.) * @param start_time Submit start timestamp * @param end_time Submit end timestamp @@ -200,28 +200,28 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding: * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes. */ -void l2_perf_aicpu_record_orch_phase( - AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id +void l2_swimlane_aicpu_record_orch_phase( + L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id ); /** * Write core-to-thread assignment mapping to shared memory. 
* - * Callers invoke `l2_perf_aicpu_init_core_assignments(total_cores)` once, then - * `l2_perf_aicpu_write_core_assignments_for_thread(t, ids, n)` for every + * Callers invoke `l2_swimlane_aicpu_init_core_assignments(total_cores)` once, then + * `l2_swimlane_aicpu_write_core_assignments_for_thread(t, ids, n)` for every * scheduler thread. */ -void l2_perf_aicpu_init_core_assignments(int total_cores); -void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num); +void l2_swimlane_aicpu_init_core_assignments(int total_cores); +void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num); /** * Flush remaining phase records for a thread * * Marks the current WRITING phase buffer as READY and enqueues it - * for host collection. Called at thread exit (analogous to l2_perf_aicpu_flush_buffers). + * for host collection. Called at thread exit (analogous to l2_swimlane_aicpu_flush). * * @param thread_idx Thread index (scheduler thread or orchestrator) */ -void l2_perf_aicpu_flush_phase_buffers(int thread_idx); +void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx); -#endif // PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_ +#endif // PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_ diff --git a/src/a2a3/platform/include/common/dep_gen.h b/src/a2a3/platform/include/common/dep_gen.h index 091fd349a..226bf9f2a 100644 --- a/src/a2a3/platform/include/common/dep_gen.h +++ b/src/a2a3/platform/include/common/dep_gen.h @@ -19,7 +19,7 @@ * sole source of truth for fanout edges; the L2 swimlane hot path no longer * carries fanout to keep AICPU off the per-task GM-store critical path. 
* - * Streaming buffer design mirrors PMU / L2Perf / TensorDump (single source of + * Streaming buffer design mirrors PMU / L2Swimlane / TensorDump (single source of * algorithmic truth in src/a2a3/platform/include/host/profiling_common/profiler_base.h): * * DepGenFreeQueue — SPSC: Host pushes free DepGenBuffers, AICPU pops them. @@ -29,7 +29,7 @@ * * Single-instance: the orchestrator is one AICPU thread, so the BufferState * array has length 1. Kept array-shaped (vs scalar) for symmetry with PMU / - * L2Perf and to match ProfilerBase::for_each_instance. + * L2Swimlane and to match ProfilerBase::for_each_instance. * * Tensor data is captured as opaque 128-byte blobs (`DEP_GEN_TENSOR_SIZE`) * matching the runtime Tensor struct size. The AICPU writer diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h index aa5422484..c9a50f525 100644 --- a/src/a2a3/platform/include/common/kernel_args.h +++ b/src/a2a3/platform/include/common/kernel_args.h @@ -83,17 +83,21 @@ struct KernelArgs { __may_used_by_aicore__ Runtime *runtime_args{nullptr}; // Task runtime in device memory uint64_t regs{0}; // Per-core register base address array (platform-specific) uint64_t ffts_base_addr{0}; // FFTS base address for AICore - uint64_t dump_data_base{0}; // Dump shared memory base address; use explicit flags to detect enablement - uint64_t l2_perf_data_base{0}; // L2 perf shared memory base address; use explicit flags to detect enablement + uint64_t dump_data_base{0}; // Dump shared memory base address; use explicit flags to detect enablement + uint64_t l2_swimlane_data_base{ + 0 + }; // L2 swimlane shared memory base address; use explicit flags to detect enablement uint64_t pmu_data_base{0}; // PMU shared memory base address; use explicit flags to detect enablement uint64_t pmu_reg_addrs{0}; // Per-core PMU MMIO register base address array (onboard only; 0 on sim) uint64_t dep_gen_data_base{0}; // dep_gen shared memory base address; 
use explicit flags to detect enablement uint64_t scope_stats_data_base{0}; // ScopeStatsBuffer shared memory base; 0 when scope_stats is off. // Allocated by host's ScopeStatsCollector, read+written by AICPU's // scope_stats_collector via set_platform_scope_stats_base. - uint64_t aicore_ring_addr{0}; // Device ptr to a uint64_t[num_aicore] table holding each core's - // L2PerfAicoreBuffer address. AICore kernel entry indexes by block_idx - // and forwards into platform set/get state. 0 when L2 swimlane is off. + uint64_t l2_swimlane_aicore_rotation_table{ + 0 + }; // Device ptr to a uint64_t[num_aicore] table holding each core's + // `&L2SwimlaneAicoreTaskPool::rotation` address. AICore kernel entry indexes by block_idx + // and forwards into platform set/get state. 0 when L2 swimlane is off. uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats diff --git a/src/a2a3/platform/include/common/l2_perf_profiling.h b/src/a2a3/platform/include/common/l2_swimlane_profiling.h similarity index 68% rename from src/a2a3/platform/include/common/l2_perf_profiling.h rename to src/a2a3/platform/include/common/l2_swimlane_profiling.h index ead322823..68d98dd75 100644 --- a/src/a2a3/platform/include/common/l2_perf_profiling.h +++ b/src/a2a3/platform/include/common/l2_swimlane_profiling.h @@ -10,49 +10,49 @@ */ /** - * @file l2_perf_profiling.h + * @file l2_swimlane_profiling.h * @brief Performance profiling data structures * * Architecture: Fixed header + per-core/thread buffer states + optional phase profiling region * * Memory layout (shared memory between Host and Device): * ┌─────────────────────────────────────────────────────────────┐ - * │ L2PerfDataHeader (fixed header) │ + * │ L2SwimlaneDataHeader (fixed header) │ * │ - ReadyQueue (FIFO,
capacity=PLATFORM_PROF_READYQUEUE_SIZE)│ * │ - Metadata (num_cores, flags) │ * ├─────────────────────────────────────────────────────────────┤ - * │ L2PerfBufferState[0] (Core 0) │ + * │ L2SwimlaneAicpuTaskPool[0] (Core 0) │ * │ - free_queue: SPSC queue of available buffer pointers │ * │ - current_buf_ptr, current_buf_seq │ * ├─────────────────────────────────────────────────────────────┤ - * │ L2PerfBufferState[1] (Core 1) │ + * │ L2SwimlaneAicpuTaskPool[1] (Core 1) │ * ├─────────────────────────────────────────────────────────────┤ * │ ... │ * ├─────────────────────────────────────────────────────────────┤ - * │ L2PerfBufferState[num_cores-1] │ + * │ L2SwimlaneAicpuTaskPool[num_cores-1] │ * ├─────────────────────────────────────────────────────────────┤ - * │ AicpuPhaseHeader (optional, present when phase profiling) │ + * │ L2SwimlaneAicpuPhaseHeader (optional, present when phase profiling) │ * │ - magic, num_sched_threads, records_per_thread │ * │ - core_to_thread mapping │ * ├─────────────────────────────────────────────────────────────┤ - * │ PhaseBufferState[thread0] │ + * │ L2SwimlaneAicpuPhasePool[thread0] │ * │ - free_queue: SPSC queue of available buffer pointers │ * │ - current_buf_ptr, current_buf_seq │ * ├─────────────────────────────────────────────────────────────┤ - * │ PhaseBufferState[thread1] │ + * │ L2SwimlaneAicpuPhasePool[thread1] │ * ├─────────────────────────────────────────────────────────────┤ * │ ... │ * └─────────────────────────────────────────────────────────────┘ * - * Actual L2PerfBuffer / PhaseBuffer are allocated dynamically by Host + * Actual L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer are allocated dynamically by Host * and pushed into the per-core/thread free_queue. 
* - * Base size = sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState) - * With phases = Base + sizeof(AicpuPhaseHeader) + num_threads * sizeof(PhaseBufferState) + * Base size = sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool) + * With phases = Base + sizeof(L2SwimlaneAicpuPhaseHeader) + num_threads * sizeof(L2SwimlaneAicpuPhasePool) */ -#ifndef SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_ -#define SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_ +#ifndef SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_ +#define SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_ #include #include @@ -61,7 +61,7 @@ #include "common/platform_config.h" // ============================================================================= -// L2 perf_level — granularity ladder for the L2 swimlane profiler. +// L2 swimlane_level — granularity ladder for the L2 swimlane profiler. // // Each level is a strict superset of the previous: higher levels add the data // described by their name on top of all lower-level data. Naming describes @@ -69,12 +69,12 @@ // naturally — e.g. `if (level >= SCHED_PHASES)` means "this section runs when // scheduler phase records are being collected (or any higher tier)". // -// Transported via `L2PerfDataHeader::l2_perf_level` (host → AICPU, +// Transported via `L2SwimlaneDataHeader::l2_swimlane_level` (host → AICPU, // shared memory) and `CallConfig::enable_l2_swimlane` (Python → C). The wire // representation stays integer (uint32_t / int32_t) for ABI stability; this // enum is the canonical in-code type used for comparisons. 
// ============================================================================= -enum class L2PerfLevel : uint32_t { +enum class L2SwimlaneLevel : uint32_t { DISABLED = 0, // No collection at all AICORE_TIMING = 1, // AICore per-task start/end timestamps + task record buffer AICPU_TIMING = 2, // + AICPU dispatch/finish timestamps @@ -83,7 +83,7 @@ enum class L2PerfLevel : uint32_t { }; // ============================================================================= -// L2PerfRecord - Single Task Execution Record +// L2SwimlaneAicpuTaskRecord - Single Task Execution Record // ============================================================================= /** @@ -95,7 +95,7 @@ enum class L2PerfLevel : uint32_t { * critical fanin tail. The host swimlane export emits empty fanout * fields; `swimlane_converter.py` joins deps.json at post-process time. */ -struct L2PerfRecord { +struct L2SwimlaneAicpuTaskRecord { // Timing information (device clock timestamps) uint64_t start_time; // Task start timestamp (get_sys_cnt) — host-filled at flush from AICore buffer uint64_t end_time; // Task end timestamp — host-filled at flush from AICore buffer @@ -114,34 +114,37 @@ struct L2PerfRecord { CoreType core_type; // Core type (AIC/AIV) uint32_t reg_task_id; // Register dispatch token (monotonic per core). // Used by the host as the join key against - // L2PerfAicoreRecord.task_id, which is what + // L2SwimlaneAicoreTaskRecord.task_id, which is what // AICore writes into the slim record. 
} __attribute__((aligned(64))); -static_assert(sizeof(L2PerfRecord) % 64 == 0, "L2PerfRecord must be 64-byte aligned for optimal cache performance"); +static_assert( + sizeof(L2SwimlaneAicpuTaskRecord) % 64 == 0, + "L2SwimlaneAicpuTaskRecord must be 64-byte aligned for optimal cache performance" +); // ============================================================================= -// L2PerfAicoreRecord - Slim AICore-Only Record (written by AICore, read by Host) +// L2SwimlaneAicoreTaskRecord - Slim AICore-Only Record (written by AICore, read by Host) // ============================================================================= /** * Slim per-task record written by AICore directly into its own per-core * output buffer (no staging slot, no AICPU read). AICPU never touches this * record. The host post-processor joins it against the AICPU-side - * L2PerfRecord on `task_id` at flush time. + * L2SwimlaneAicpuTaskRecord (this `task_id` == its `reg_task_id`) at flush time. * * Layout: 24B payload + 8B pad → 32B (half a cache line). Two records pack * into one cache line so AICore's per-task store is at most a single line * commit + dcci. */ -struct L2PerfAicoreRecord { +struct L2SwimlaneAicoreTaskRecord { uint64_t start_time; // Task start timestamp (get_sys_cnt) uint64_t end_time; // Task end timestamp uint32_t task_id; // Register dispatch token (low 32 bits) uint32_t _pad; } __attribute__((aligned(32))); -static_assert(sizeof(L2PerfAicoreRecord) == 32, "L2PerfAicoreRecord must be 32B"); +static_assert(sizeof(L2SwimlaneAicoreTaskRecord) == 32, "L2SwimlaneAicoreTaskRecord must be 32B"); // ============================================================================= // TypedBuffer - Templated Fixed-Size Profiling Buffer @@ -149,13 +152,13 @@ static_assert(sizeof(L2PerfAicoreRecord) == 32, "L2PerfAicoreRecord must be 32B" /** * Generic fixed-capacity profiling buffer: contiguous record array followed - * by a producer-written count.
Layout matches the legacy L2PerfBuffer so the + * by a producer-written count. Layout matches the legacy L2SwimlaneAicpuTaskBuffer so the * host allocator and the AICPU consumer can treat all concrete instances * uniformly. * * Concrete instantiations live below as `using` aliases. - * - L2PerfBuffer — AICPU-written, rotated, ready-queue tagged is_phase=0 - * - L2PerfAicoreBuffer — AICore-written, NOT rotated (sized for the full + * - L2SwimlaneAicpuTaskBuffer — AICPU-written, rotated, ready-queue tagged kind=AicpuTask + * - L2SwimlaneAicoreTaskBuffer — AICore-written, rotated too (NOT sized for the full * session), read by host at flush time */ template @@ -164,9 +167,9 @@ struct TypedBuffer { volatile uint32_t count; } __attribute__((aligned(64))); -using L2PerfBuffer = TypedBuffer; +using L2SwimlaneAicpuTaskBuffer = TypedBuffer; -// AICore buffer is rotated like L2PerfBuffer: a small fixed capacity per +// AICore buffer is rotated like L2SwimlaneAicpuTaskBuffer: a small fixed capacity per // buffer plus a per-core pool, so an arbitrarily long session never wraps. // Per-buffer capacity is a power of two so the AICore-local // `slot_within_buf` increment lowers to a bitwise AND for boundary checks. @@ -180,10 +183,10 @@ static_assert( // ready-queue capacity formula there can include the AICore pool's worst-case // burst depth alongside the AICPU and Phase pools. -using L2PerfAicoreBuffer = TypedBuffer; +using L2SwimlaneAicoreTaskBuffer = TypedBuffer; // ============================================================================= -// L2PerfFreeQueue - SPSC Lock-Free Queue for Free Buffers +// L2SwimlaneFreeQueue - SPSC Lock-Free Queue for Free Buffers // ============================================================================= /** @@ -201,17 +204,17 @@ using L2PerfAicoreBuffer = TypedBuffer; /** * AICPU phase profiling header * - * Located after the L2PerfBufferState array in shared memory. + * Located after the L2SwimlaneAicpuTaskPool array in shared memory.
* Contains metadata and per-thread tracking. */ -struct AicpuPhaseHeader { - uint32_t magic; // Validation magic (AICPU_PHASE_MAGIC) +struct L2SwimlaneAicpuPhaseHeader { + uint32_t magic; // Validation magic (L2_SWIMLANE_AICPU_PHASE_MAGIC) uint32_t num_sched_threads; // Number of scheduler threads - uint32_t records_per_thread; // Max records per PhaseBuffer + uint32_t records_per_thread; // Max records per L2SwimlaneAicpuPhaseBuffer uint32_t num_cores; // Total number of cores with valid assignments int8_t core_to_thread[PLATFORM_MAX_CORES]; // core_id → scheduler thread index (-1 = unassigned) } __attribute__((aligned(64))); @@ -492,41 +476,45 @@ extern "C" { * Calculate total memory size for performance data (buffer states only, no buffers) * * Formula: Total size = Fixed header + Dynamic tail - * = sizeof(L2PerfDataHeader) + num_cores × sizeof(L2PerfBufferState) + * = sizeof(L2SwimlaneDataHeader) + num_cores × sizeof(L2SwimlaneAicpuTaskPool) * * @param num_cores Number of cores (block_dim × PLATFORM_CORES_PER_BLOCKDIM) * @return Total bytes for header + buffer states */ inline size_t calc_perf_data_size(int num_cores) { - return sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState); + return sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool); } /** * Get header pointer * * @param base_ptr Shared memory base address (device_ptr or host_ptr) - * @return L2PerfDataHeader pointer + * @return L2SwimlaneDataHeader pointer */ -inline L2PerfDataHeader *get_l2_perf_header(void *base_ptr) { return reinterpret_cast(base_ptr); } +inline L2SwimlaneDataHeader *get_l2_swimlane_header(void *base_ptr) { + return reinterpret_cast(base_ptr); +} /** - * Get L2PerfBufferState array start address + * Get L2SwimlaneAicpuTaskPool array start address * * @param base_ptr Shared memory base address - * @return L2PerfBufferState array pointer + * @return L2SwimlaneAicpuTaskPool array pointer */ -inline L2PerfBufferState *get_perf_buffer_states(void 
*base_ptr) { - return reinterpret_cast(reinterpret_cast(base_ptr) + sizeof(L2PerfDataHeader)); +inline L2SwimlaneAicpuTaskPool *get_perf_buffer_states(void *base_ptr) { + return reinterpret_cast( + reinterpret_cast(base_ptr) + sizeof(L2SwimlaneDataHeader) + ); } /** - * Get L2PerfBufferState for specified core + * Get L2SwimlaneAicpuTaskPool for specified core * * @param base_ptr Shared memory base address * @param core_index Core index (0 ~ num_cores-1) - * @return L2PerfBufferState pointer + * @return L2SwimlaneAicpuTaskPool pointer */ -inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index) { +inline L2SwimlaneAicpuTaskPool *get_perf_buffer_state(void *base_ptr, int core_index) { return &get_perf_buffer_states(base_ptr)[core_index]; } @@ -534,55 +522,55 @@ inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index) * Calculate total memory size including AICore states and phase profiling * region (buffer states only, not the record payloads themselves). 
* - * Layout (after the fixed L2PerfDataHeader): - * [L2PerfBufferState × num_cores] - * [L2PerfAicoreBufferState × num_cores] - * [AicpuPhaseHeader] - * [PhaseBufferState × num_sched_threads] + * Layout (after the fixed L2SwimlaneDataHeader): + * [L2SwimlaneAicpuTaskPool × num_cores] + * [L2SwimlaneAicoreTaskPool × num_cores] + * [L2SwimlaneAicpuPhaseHeader] + * [L2SwimlaneAicpuPhasePool × num_sched_threads] * * @param num_cores Number of AICore instances * @param num_sched_threads Number of phase profiling threads (scheduler + orchestrator) * @return Total bytes needed for header + all buffer states */ inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) { - return calc_perf_data_size(num_cores) + num_cores * sizeof(L2PerfAicoreBufferState) + sizeof(AicpuPhaseHeader) + - num_sched_threads * sizeof(PhaseBufferState); + return calc_perf_data_size(num_cores) + num_cores * sizeof(L2SwimlaneAicoreTaskPool) + + sizeof(L2SwimlaneAicpuPhaseHeader) + num_sched_threads * sizeof(L2SwimlaneAicpuPhasePool); } /** - * Get L2PerfAicoreBufferState array start address (located immediately - * after the L2PerfBufferState array, before the AicpuPhaseHeader). + * Get L2SwimlaneAicoreTaskPool array start address (located immediately + * after the L2SwimlaneAicpuTaskPool array, before the L2SwimlaneAicpuPhaseHeader). 
*/ -inline L2PerfAicoreBufferState *get_aicore_buffer_states(void *base_ptr, int num_cores) { - return reinterpret_cast( +inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_states(void *base_ptr, int num_cores) { + return reinterpret_cast( reinterpret_cast(base_ptr) + calc_perf_data_size(num_cores) ); } -inline L2PerfAicoreBufferState *get_aicore_buffer_state(void *base_ptr, int num_cores, int core_index) { +inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_state(void *base_ptr, int num_cores, int core_index) { return &get_aicore_buffer_states(base_ptr, num_cores)[core_index]; } /** - * Get AicpuPhaseHeader pointer (located after the L2PerfAicoreBufferState array). + * Get L2SwimlaneAicpuPhaseHeader pointer (located after the L2SwimlaneAicoreTaskPool array). */ -inline AicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) { - return reinterpret_cast( +inline L2SwimlaneAicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) { + return reinterpret_cast( reinterpret_cast(base_ptr) + calc_perf_data_size(num_cores) + - num_cores * sizeof(L2PerfAicoreBufferState) + num_cores * sizeof(L2SwimlaneAicoreTaskPool) ); } /** - * Get PhaseBufferState array start address (located after AicpuPhaseHeader) + * Get L2SwimlaneAicpuPhasePool array start address (located after L2SwimlaneAicpuPhaseHeader) */ -inline PhaseBufferState *get_phase_buffer_states(void *base_ptr, int num_cores) { - return reinterpret_cast( - reinterpret_cast(get_phase_header(base_ptr, num_cores)) + sizeof(AicpuPhaseHeader) +inline L2SwimlaneAicpuPhasePool *get_phase_buffer_states(void *base_ptr, int num_cores) { + return reinterpret_cast( + reinterpret_cast(get_phase_header(base_ptr, num_cores)) + sizeof(L2SwimlaneAicpuPhaseHeader) ); } -inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) { +inline L2SwimlaneAicpuPhasePool *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) { return &get_phase_buffer_states(base_ptr, 
num_cores)[thread_idx]; } @@ -590,4 +578,4 @@ inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, i } #endif -#endif // SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_ +#endif // SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_ diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h index a514503a2..e070396b1 100644 --- a/src/a2a3/platform/include/common/platform_config.h +++ b/src/a2a3/platform/include/common/platform_config.h @@ -104,7 +104,7 @@ constexpr int PLATFORM_MAX_CORES = PLATFORM_MAX_BLOCKDIM * PLATFORM_CORES_PER_BL /** * Performance buffer capacity per buffer - * Number of L2PerfRecord entries per dynamically allocated L2PerfBuffer + * Number of L2SwimlaneAicpuTaskRecord entries per dynamically allocated L2SwimlaneAicpuTaskBuffer */ constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000; @@ -118,13 +118,13 @@ constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000; constexpr int PLATFORM_PROF_SLOT_COUNT = 4; /** - * L2PerfBuffer pre-allocation count per AICore. + * L2SwimlaneAicpuTaskBuffer pre-allocation count per AICore. * 1 goes into the free_queue at init, the rest into the recycled pool. */ constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8; /** - * L2PerfAicoreBuffer pre-allocation count per AICore (AICore-as-producer pool). + * L2SwimlaneAicoreTaskBuffer pre-allocation count per AICore (AICore-as-producer pool). * 1 goes into the free_queue at init, the rest into the recycled pool. * Mirrors PLATFORM_PROF_BUFFERS_PER_CORE in role; smaller because AICore records * are slim (32 B each) and the buffer is also smaller per the rotation design. @@ -132,7 +132,7 @@ constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8; constexpr int PLATFORM_AICORE_BUFFERS_PER_CORE = 4; /** - * PhaseBuffer pre-allocation count per AICPU thread. + * L2SwimlaneAicpuPhaseBuffer pre-allocation count per AICPU thread. * 1 goes into the free_queue at init, the rest into the recycled pool. 
*/ constexpr int PLATFORM_PROF_BUFFERS_PER_THREAD = 16; @@ -141,8 +141,8 @@ constexpr int PLATFORM_PROF_BUFFERS_PER_THREAD = 16; * Ready queue capacity for performance data collection. * Queue holds ReadyQueueEntry structs for buffers ready to be read by Host. * Sized to match pre-allocation total across all cores and threads, summed - * over the three buffer kinds (AICPU L2PerfBuffer, PhaseBuffer, - * AICore L2PerfAicoreBuffer). + * over the three buffer kinds (AICPU L2SwimlaneAicpuTaskBuffer, L2SwimlaneAicpuPhaseBuffer, + * AICore L2SwimlaneAicoreTaskBuffer). */ constexpr int PLATFORM_PROF_READYQUEUE_SIZE = PLATFORM_MAX_CORES * PLATFORM_PROF_BUFFERS_PER_CORE + PLATFORM_MAX_AICPU_THREADS * PLATFORM_PROF_BUFFERS_PER_THREAD + diff --git a/src/a2a3/platform/include/common/pmu_profiling.h b/src/a2a3/platform/include/common/pmu_profiling.h index 846e87b6a..9c6def972 100644 --- a/src/a2a3/platform/include/common/pmu_profiling.h +++ b/src/a2a3/platform/include/common/pmu_profiling.h @@ -17,7 +17,7 @@ * Software License 2.0). Register offsets live in platform_config.h and are * accessed via RegId / reg_index(). * - * Streaming buffer design (mirrors l2_perf_profiling.h): + * Streaming buffer design (mirrors l2_swimlane_profiling.h): * PmuFreeQueue — SPSC queue: Host pushes free PmuBuffers, AICPU pops them. * PmuBufferState — Per-core state: current active buffer pointer + free_queue. * PmuDataHeader — Fixed shared-memory header: per-thread ready queues. @@ -54,7 +54,7 @@ constexpr uint32_t PMU_EVENT_TYPE_DEFAULT = static_cast(PmuEventType:: /** * Event ID table for a single event type. - * `event_ids[i]` programs PMU_CNTi_IDX; `counters[i]` in the L2PerfRecord is the + * `event_ids[i]` programs PMU_CNTi_IDX; `pmu_counters[i]` in the PmuRecord is the * value of PMU_CNTi after the task completes. * `counter_names[i]` is the human-readable CSV column name for counter i. * Empty string ("") marks an unused slot. 
@@ -134,7 +134,7 @@ inline const PmuEventConfig *pmu_resolve_event_config_a2a3(PmuEventType event_ty * Per-task PMU snapshot written by AICPU after each AICore task FIN. */ struct PmuRecord { - uint64_t task_id; // Same encoding as L2PerfRecord.task_id + uint64_t task_id; // Same encoding as L2SwimlaneAicpuTaskRecord.task_id uint32_t func_id; // Kernel function identifier CoreType core_type; // AIC or AIV uint64_t pmu_total_cycles; // PMU_CNT_TOTAL (64-bit combined) @@ -142,7 +142,7 @@ struct PmuRecord { } __attribute__((aligned(64))); // ============================================================================= -// PMU Streaming Buffer Structures (mirrors l2_perf_profiling.h) +// PMU Streaming Buffer Structures (mirrors l2_swimlane_profiling.h) // ============================================================================= /** diff --git a/src/a2a3/platform/include/common/scope_stats.h b/src/a2a3/platform/include/common/scope_stats.h index 88efa72dd..844e34089 100644 --- a/src/a2a3/platform/include/common/scope_stats.h +++ b/src/a2a3/platform/include/common/scope_stats.h @@ -17,7 +17,7 @@ * scope_end — each carrying the task/heap ring start/end and the tensormap * live-entry count sampled at that boundary, tagged with a phase flag. Records * stream off the device in - * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_perf (the + * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_swimlane (the * single source of mgmt-loop truth is * src/a2a3/platform/include/host/profiling_common/profiler_base.h): * diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h index 48afba1cf..afbebcb27 100644 --- a/src/a2a3/platform/include/common/tensor_dump.h +++ b/src/a2a3/platform/include/common/tensor_dump.h @@ -139,7 +139,7 @@ struct DumpMetaBuffer { /** * Single Producer Single Consumer (SPSC) lock-free queue. 
- * Same layout and semantics as L2PerfFreeQueue, separate type for decoupling. + * Same layout and semantics as L2SwimlaneFreeQueue, separate type for decoupling. * * Producer: Host (DumpMemoryManager thread) pushes recycled/new buffers * Consumer: Device (AICPU thread) pops buffers when switching diff --git a/src/a2a3/platform/include/host/dep_gen_collector.h b/src/a2a3/platform/include/host/dep_gen_collector.h index 5c48723df..ae036683c 100644 --- a/src/a2a3/platform/include/host/dep_gen_collector.h +++ b/src/a2a3/platform/include/host/dep_gen_collector.h @@ -264,7 +264,7 @@ class DepGenCollector : public profiling_common::ProfilerBase: shared mgmt-thread infrastructure that polls + * - BufferPoolManager: shared mgmt-thread infrastructure that polls * the AICPU ready queue, replenishes per-core / per-thread free queues, and * hands full buffers off to the collector thread. - * - L2PerfCollector: main thread copies records from the manager's ready queue + * - L2SwimlaneCollector: main thread copies records from the manager's ready queue * into host vectors and exports the swimlane visualization. * * Memory operations are injected through callbacks for sim/onboard portability. 
*/ -#ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_ -#define SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_ +#ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_ +#define SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_ #include #include @@ -33,33 +33,33 @@ #include #include -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "host/profiling_common/profiler_base.h" // --------------------------------------------------------------------------- -// L2 Perf profiling Module (drives BufferPoolManager) +// L2 Perf profiling Module (drives BufferPoolManager) // --------------------------------------------------------------------------- /** * L2 Perf has two distinct buffer kinds going through one ready queue per * AICPU thread: - * - kind 0: per-core L2PerfBuffer (task records) - * - kind 1: per-thread PhaseBuffer (scheduler/orchestrator phase records) - * The ReadyQueueEntry::is_phase flag picks between them. + * - kind 0: per-core L2SwimlaneAicpuTaskBuffer (task records) + * - kind 1: per-thread L2SwimlaneAicpuPhaseBuffer (scheduler/orchestrator phase records) + * The ReadyQueueEntry::kind flag picks between them. */ /** * Buffer kind discriminator carried in ReadyBufferInfo and used to index the * per-kind recycled pool inside BufferPoolManager. 
- * PERF_RECORD: per-core AICPU-written L2PerfBuffer - * PHASE: per-thread AICPU-written PhaseBuffer - * AICORE: per-core AICore-written L2PerfAicoreBuffer (rotation driven + * PERF_RECORD: per-core AICPU-written L2SwimlaneAicpuTaskBuffer + * PHASE: per-thread AICPU-written L2SwimlaneAicpuPhaseBuffer + * AICORE: per-core AICore-written L2SwimlaneAicoreTaskBuffer (rotation driven * by AICPU at dispatch boundaries) */ -enum class ProfBufferType { PERF_RECORD = 0, PHASE = 1, AICORE = 2 }; +enum class ProfBufferType { AICPU_TASK = 0, AICPU_PHASE = 1, AICORE_TASK = 2 }; /** * Information about a ready (full) buffer, passed from mgmt thread to main thread. @@ -73,16 +73,16 @@ struct ReadyBufferInfo { uint32_t buffer_seq; // Sequence number for ordering }; -struct L2PerfModule { - using DataHeader = L2PerfDataHeader; +struct L2SwimlaneModule { + using DataHeader = L2SwimlaneDataHeader; using ReadyEntry = ReadyQueueEntry; using ReadyBufferInfo = ::ReadyBufferInfo; - using FreeQueue = L2PerfFreeQueue; // PhaseBufferState aliases L2PerfBufferState + using FreeQueue = L2SwimlaneFreeQueue; // L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool static constexpr int kBufferKinds = 3; // 0=PERF_RECORD, 1=PHASE, 2=AICORE static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT; - static constexpr const char *kSubsystemName = "L2PerfModule"; + static constexpr const char *kSubsystemName = "L2SwimlaneModule"; /** * batch_size for proactive_replenish's alloc fallback. Sized so that a @@ -99,31 +99,34 @@ struct L2PerfModule { static int kind_of(const ReadyBufferInfo &info) { return static_cast(info.type); } - static DataHeader *header_from_shm(void *shm) { return get_l2_perf_header(shm); } + static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); } /** - * Branch on entry.is_phase (kind discriminator 0/1/2) to pick the - * per-core perf state vs. 
the per-thread phase state vs. the per-core - * AICore state. Returns nullopt for out-of-range indices. + * Branch on entry.kind to pick the per-core perf state vs. the per-thread + * phase state vs. the per-core AICore state. Returns nullopt for + * out-of-range indices. */ - static std::optional> + static std::optional> resolve_entry(void *shm, DataHeader *header, int /*q*/, const ReadyEntry &entry) { const int num_cores = static_cast(header->num_cores); - const uint32_t kind = entry.is_phase; + const L2SwimlaneBufferKind kind = entry.kind; - if (kind == 1) { + if (kind == L2SwimlaneBufferKind::AicpuPhase) { if (entry.core_index >= static_cast(PLATFORM_MAX_AICPU_THREADS)) { - LOG_ERROR("L2PerfModule: invalid phase entry: thread=%u", entry.core_index); + LOG_ERROR("L2SwimlaneModule: invalid phase entry: thread=%u", entry.core_index); return std::nullopt; } } else { if (entry.core_index >= static_cast(num_cores)) { - LOG_ERROR("L2PerfModule: invalid perf entry: core=%u kind=%u", entry.core_index, kind); + LOG_ERROR( + "L2SwimlaneModule: invalid perf entry: core=%u kind=%u", entry.core_index, + static_cast(kind) + ); return std::nullopt; } } - profiling_common::EntrySite site; + profiling_common::EntrySite site; site.kind = static_cast(kind); site.info.index = entry.core_index; site.info.slot_idx = 0; @@ -131,22 +134,23 @@ struct L2PerfModule { site.info.host_buffer_ptr = nullptr; // filled by ProfilerAlgorithms site.info.buffer_seq = entry.buffer_seq; - if (kind == 0) { - L2PerfBufferState *state = get_perf_buffer_state(shm, static_cast(entry.core_index)); + if (kind == L2SwimlaneBufferKind::AicpuTask) { + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm, static_cast(entry.core_index)); site.free_queue = &state->free_queue; - site.buffer_size = sizeof(L2PerfBuffer); - site.info.type = ProfBufferType::PERF_RECORD; - } else if (kind == 1) { - PhaseBufferState *state = get_phase_buffer_state(shm, num_cores, static_cast(entry.core_index)); + site.buffer_size 
= sizeof(L2SwimlaneAicpuTaskBuffer); + site.info.type = ProfBufferType::AICPU_TASK; + } else if (kind == L2SwimlaneBufferKind::AicpuPhase) { + L2SwimlaneAicpuPhasePool *state = + get_phase_buffer_state(shm, num_cores, static_cast(entry.core_index)); site.free_queue = &state->free_queue; - site.buffer_size = sizeof(PhaseBuffer); - site.info.type = ProfBufferType::PHASE; - } else { // kind == 2 (AICORE) - L2PerfAicoreBufferState *ac_state = + site.buffer_size = sizeof(L2SwimlaneAicpuPhaseBuffer); + site.info.type = ProfBufferType::AICPU_PHASE; + } else { // L2SwimlaneBufferKind::AicoreTask + L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(shm, num_cores, static_cast(entry.core_index)); site.free_queue = &ac_state->free_queue; - site.buffer_size = sizeof(L2PerfAicoreBuffer); - site.info.type = ProfBufferType::AICORE; + site.buffer_size = sizeof(L2SwimlaneAicoreTaskBuffer); + site.info.type = ProfBufferType::AICORE_TASK; } return site; } @@ -157,23 +161,24 @@ struct L2PerfModule { // Per-core perf states (kind 0) for (int i = 0; i < num_cores; i++) { - L2PerfBufferState *state = get_perf_buffer_state(shm, i); - cb(/*kind=*/0, &state->free_queue, sizeof(L2PerfBuffer)); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm, i); + cb(/*kind=*/0, &state->free_queue, sizeof(L2SwimlaneAicpuTaskBuffer)); } // Per-core AICore states (kind 2) for (int i = 0; i < num_cores; i++) { - L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(shm, num_cores, i); - cb(/*kind=*/2, &ac_state->free_queue, sizeof(L2PerfAicoreBuffer)); + L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(shm, num_cores, i); + cb(/*kind=*/2, &ac_state->free_queue, sizeof(L2SwimlaneAicoreTaskBuffer)); } - // Per-thread phase states (kind 1) — gated on AicpuPhaseHeader being + // Per-thread phase states (kind 1) — gated on L2SwimlaneAicpuPhaseHeader being // initialized (runtimes that don't emit phase records leave it zero). 
- AicpuPhaseHeader *ph = get_phase_header(shm, num_cores); - const int num_phase_threads = (ph->magic == AICPU_PHASE_MAGIC) ? static_cast(ph->num_sched_threads) : 0; + L2SwimlaneAicpuPhaseHeader *ph = get_phase_header(shm, num_cores); + const int num_phase_threads = + (ph->magic == L2_SWIMLANE_AICPU_PHASE_MAGIC) ? static_cast(ph->num_sched_threads) : 0; for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(shm, num_cores, t); - cb(/*kind=*/1, &state->free_queue, sizeof(PhaseBuffer)); + L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm, num_cores, t); + cb(/*kind=*/1, &state->free_queue, sizeof(L2SwimlaneAicpuPhaseBuffer)); } } }; @@ -182,13 +187,13 @@ struct L2PerfModule { // alloc / free are std::function so callers bind their MemoryAllocator via // lambda capture; register / unregister stay as plain function pointers // because they wrap stateless HAL globals (halHost*). -using L2PerfAllocCallback = profiling_common::ProfAllocCallback; -using L2PerfRegisterCallback = profiling_common::ProfRegisterCallback; -using L2PerfUnregisterCallback = profiling_common::ProfUnregisterCallback; -using L2PerfFreeCallback = profiling_common::ProfFreeCallback; +using L2SwimlaneAllocCallback = profiling_common::ProfAllocCallback; +using L2SwimlaneRegisterCallback = profiling_common::ProfRegisterCallback; +using L2SwimlaneUnregisterCallback = profiling_common::ProfUnregisterCallback; +using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback; // ============================================================================= -// L2PerfCollector +// L2SwimlaneCollector // ============================================================================= /** @@ -206,7 +211,7 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback; * (mgmt first so its final-drain entries * have a consumer). * 5. read_phase_header_metadata() — single-shot read of the core→thread - * mapping from AicpuPhaseHeader. 
+ * mapping from L2SwimlaneAicpuPhaseHeader. * 6. reconcile_counters() — device-side three-bucket accounting for * both PERF and PHASE pools (total / * collected / dropped). @@ -216,33 +221,33 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback; * device flush is the only data path. Any non-zero `current_buf_ptr` after * stop() is logged as a bug. */ -class L2PerfCollector : public profiling_common::ProfilerBase { +class L2SwimlaneCollector : public profiling_common::ProfilerBase { public: - L2PerfCollector() = default; - ~L2PerfCollector(); + L2SwimlaneCollector() = default; + ~L2SwimlaneCollector(); - L2PerfCollector(const L2PerfCollector &) = delete; - L2PerfCollector &operator=(const L2PerfCollector &) = delete; + L2SwimlaneCollector(const L2SwimlaneCollector &) = delete; + L2SwimlaneCollector &operator=(const L2SwimlaneCollector &) = delete; // ProfilerBase contract static constexpr int kIdleTimeoutSec = PLATFORM_PROF_TIMEOUT_SECONDS; - static constexpr const char *kSubsystemName = "L2Perf"; + static constexpr const char *kSubsystemName = "L2Swimlane"; /** * Initialize performance profiling. * * Allocates the shared-memory region (header + per-core / per-thread - * BufferStates), pre-allocates initial L2PerfBuffers and PhaseBuffers, + * BufferStates), pre-allocates initial L2SwimlaneAicpuTaskBuffers and PhaseBuffers, * and seeds the per-pool free_queues + the framework's recycled pools. * * @param num_aicore Number of AICore instances * @param device_id Device ID (forwarded to register_cb) - * @param l2_perf_level Collection granularity (DISABLED / AICORE_TIMING + * @param l2_swimlane_level Collection granularity (DISABLED / AICORE_TIMING * / AICPU_TIMING / SCHED_PHASES / ORCH_PHASES). 
* Written into - * `L2PerfDataHeader::l2_perf_level` + * `L2SwimlaneDataHeader::l2_swimlane_level` * so AICPU can promote it in - * `l2_perf_aicpu_init`, AND cached on the + * `l2_swimlane_aicpu_init`, AND cached on the * collector so `export_swimlane_json()` * can gate phase sections and stamp the * JSON `version`. @@ -251,28 +256,28 @@ class L2PerfCollector : public profiling_common::ProfilerBase/l2_perf_records.json — directory is captured at + * Writes /l2_swimlane_records.json — directory is captured at * initialize() time. * * @return 0 on success, error code on failure @@ -288,7 +293,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase> &get_records() const { return collected_perf_records_; } + const std::vector> &get_records() const { return collected_perf_records_; } private: // Shared memory pointers. shm_host_ / device_id_ live on ProfilerBase // (set via set_memory_context in initialize()). void *perf_shared_mem_dev_{nullptr}; - // Standalone uint64_t[num_aicore] table holding per-core L2PerfAicoreBuffer + // Standalone uint64_t[num_aicore] table holding per-core L2SwimlaneAicoreTaskBuffer // addresses. Allocated in initialize(), freed in finalize(). AICore reads - // ring_table[block_idx] via KernelArgs::aicore_ring_addr. + // ring_table[block_idx] via KernelArgs::l2_swimlane_aicore_rotation_table. void *aicore_ring_addr_table_dev_{nullptr}; int num_aicore_{0}; - L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; + L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; // Per-task output directory captured at initialize() time. Consumed by - // export_swimlane_json() to build /l2_perf_records.json. + // export_swimlane_json() to build /l2_swimlane_records.json. std::string output_prefix_; // Collected data (per-core vectors, indexed by core_index) - std::vector> collected_perf_records_; + std::vector> collected_perf_records_; // Collected AICore records (per-core vectors). 
Each entry is a full - // L2PerfAicoreRecord captured from a rotated L2PerfAicoreBuffer. The + // L2SwimlaneAicoreTaskRecord captured from a rotated L2SwimlaneAicoreTaskBuffer. The // order across rotations is preserved by `copy_aicore_buffer` (we sort // incoming buffers by buffer_seq before flattening). - std::vector> collected_aicore_records_; + std::vector> collected_aicore_records_; // AICPU phase profiling data (per-thread, mixed sched + orch records) - std::vector> collected_phase_records_; + std::vector> collected_phase_records_; bool has_phase_data_{false}; // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned) @@ -374,7 +380,7 @@ class L2PerfCollector : public profiling_common::ProfilerBaseaicore_ring_ptr). AICPU never reads it on the hot path. + // per-core L2SwimlaneAicoreTaskBuffer (allocated by initialize(), addressed via + // L2SwimlaneAicoreTaskPool::rotation, which AICPU rotates per BUFFER_SIZE + // completion). AICPU never reads the AICore records on the hot path. // join_aicore_records() runs after stop(): it walks each core's buffer, // builds a `task_id_low32 → (start, end)` map, then patches the matching - // L2PerfRecord entries in collected_perf_records_. Called from + // L2SwimlaneAicpuTaskRecord entries in collected_perf_records_. Called from // export_swimlane_json() so external callers see a transparent stream. 
void join_aicore_records(); }; -#endif // SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_ +#endif // SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_ diff --git a/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h b/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h index 670803f9f..54673c07a 100644 --- a/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h +++ b/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h @@ -11,7 +11,7 @@ /** * @file buffer_pool_manager.h - * @brief Generic buffer-pool data structure shared by L2Perf, TensorDump, + * @brief Generic buffer-pool data structure shared by L2Swimlane, TensorDump, * and PMU collectors. Owns: * * - ready_queue (mgmt → collector) with mutex/cv, @@ -333,7 +333,7 @@ class BufferPoolManager { // dev → host mapping (single source of truth for resolve_host_ptr) std::unordered_map dev_to_host_; - // Per-kind recycled buffer pools (vector indexed by Module's BufferKind id) + // Per-kind recycled buffer pools (vector indexed by Module-defined kind id) std::vector> recycled_; }; diff --git a/src/a2a3/platform/include/host/profiling_common/profiler_base.h b/src/a2a3/platform/include/host/profiling_common/profiler_base.h index fbf4cebe2..e9b06f7d1 100644 --- a/src/a2a3/platform/include/host/profiling_common/profiler_base.h +++ b/src/a2a3/platform/include/host/profiling_common/profiler_base.h @@ -11,7 +11,7 @@ /** * @file profiler_base.h - * @brief CRTP scaffolding shared by L2Perf / Dump / PMU collectors. + * @brief CRTP scaffolding shared by L2Swimlane / Dump / PMU collectors. * * Owns the BufferPoolManager, the mgmt thread (which polls AICPU * ready queues and recycles buffers), and the collector poll thread. 
@@ -19,12 +19,12 @@ * Module concept contract * ----------------------- * - * Each profiling subsystem provides a `Module` struct (e.g., L2PerfModule, + * Each profiling subsystem provides a `Module` struct (e.g., L2SwimlaneModule, * DumpModule, PmuModule) that supplies the data-layout traits the unified * mgmt-loop algorithms (ProfilerAlgorithms) need. Required members: * * // Types - * using DataHeader = ...; // Shared-memory header (e.g. L2PerfDataHeader). + * using DataHeader = ...; // Shared-memory header (e.g. L2SwimlaneDataHeader). * using ReadyEntry = ...; // Per-AICPU-thread ready-queue entry. * using ReadyBufferInfo = ...; // Hand-off struct to the collector thread * // (carries dev/host ptrs, optional kind @@ -34,10 +34,10 @@ * // `buffer_ptrs[kSlotCount]`. * * // Constants - * static constexpr int kBufferKinds; // L2Perf=2 (perf+phase), Dump=1, PMU=1. + * static constexpr int kBufferKinds; // L2Swimlane=2 (perf+phase), Dump=1, PMU=1. * static constexpr uint32_t kReadyQueueSize; // Per-thread ready-queue depth. * static constexpr uint32_t kSlotCount; // FreeQueue::buffer_ptrs[] length. - * static constexpr const char* kSubsystemName; // "PMU" / "L2Perf" / "Dump". + * static constexpr const char* kSubsystemName; // "PMU" / "L2Swimlane" / "Dump". * * // Header pointer cast (host_ptr → DataHeader*) * static DataHeader* header_from_shm(void* shared_mem_host); @@ -117,7 +117,7 @@ * (use the subsystem's PLATFORM_*_TIMEOUT_SECONDS). * * static constexpr const char* kSubsystemName; - * Used in the idle-timeout log line (e.g. "L2Perf", "PMU", "TensorDump"). + * Used in the idle-timeout log line (e.g. "L2Swimlane", "PMU", "TensorDump"). */ #ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_PROFILING_COMMON_PROFILER_BASE_H_ @@ -138,7 +138,7 @@ namespace profiling_common { // Common subsystem callback signatures. 
All four collectors (PMU / TensorDump -// / L2Perf / DepGen) used to declare their own typedefs with identical +// / L2Swimlane / DepGen) used to declare their own typedefs with identical // shapes; these are the canonical types stashed in ProfilerBase via // set_memory_context(). // diff --git a/src/a2a3/platform/onboard/aicore/kernel.cpp b/src/a2a3/platform/onboard/aicore/kernel.cpp index e87cb81ef..5f504f54d 100644 --- a/src/a2a3/platform/onboard/aicore/kernel.cpp +++ b/src/a2a3/platform/onboard/aicore/kernel.cpp @@ -15,7 +15,7 @@ #include "aicore/aicore_profiling_state.h" #include "common/core_type.h" #include "common/kernel_args.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #ifdef __DAV_VEC__ #define KERNEL_ENTRY(x) \ @@ -45,25 +45,26 @@ [[block_local]] static uint32_t s_aicore_profiling_flag; // Slot pointer (NOT the dereferenced rotation address) — see // aicore_profiling_state.h for the lazy-deref contract. -[[block_local]] static __gm__ uint64_t *s_aicore_rotation_slot; -[[block_local]] static __gm__ AicoreRotation *s_aicore_rotation; +[[block_local]] static __gm__ uint64_t *s_l2_swimlane_aicore_rotation_slot; +[[block_local]] static __gm__ L2SwimlaneAicoreRotation *s_l2_swimlane_aicore_rotation; __attribute__((weak)) __aicore__ void set_aicore_profiling_flag(uint32_t flag) { s_aicore_profiling_flag = flag; } __attribute__((weak)) __aicore__ uint32_t get_aicore_profiling_flag() { return s_aicore_profiling_flag; } -__attribute__((weak)) __aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { - s_aicore_rotation_slot = slot_ptr; - s_aicore_rotation = nullptr; // force lazy resolution on next get +__attribute__((weak)) __aicore__ void set_l2_swimlane_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { + s_l2_swimlane_aicore_rotation_slot = slot_ptr; + s_l2_swimlane_aicore_rotation = nullptr; // force lazy resolution on next get } -__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { 
- // Lazy first-call resolve: AICPU init populates `*s_aicore_rotation_slot` +__attribute__((weak)) __aicore__ __gm__ L2SwimlaneAicoreRotation *get_l2_swimlane_aicore_rotation() { + // Lazy first-call resolve: AICPU init populates `*s_l2_swimlane_aicore_rotation_slot` // before dispatching the first task, so by the time the executor reaches // for the rotation (inside the first-task branch of the dispatch poll) // the slot holds a valid device address. - if (s_aicore_rotation == nullptr && s_aicore_rotation_slot != nullptr) { - s_aicore_rotation = reinterpret_cast<__gm__ AicoreRotation *>(*s_aicore_rotation_slot); + if (s_l2_swimlane_aicore_rotation == nullptr && s_l2_swimlane_aicore_rotation_slot != nullptr) { + s_l2_swimlane_aicore_rotation = + reinterpret_cast<__gm__ L2SwimlaneAicoreRotation *>(*s_l2_swimlane_aicore_rotation_slot); } - return s_aicore_rotation; + return s_l2_swimlane_aicore_rotation; } extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type); @@ -101,17 +102,18 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA // Publish per-core profiling state into platform-owned slots before the // executor runs. AICore reads via get_aicore_profiling_flag() / - // get_aicore_rotation() — never touches Handshake for profiling. + // get_l2_swimlane_aicore_rotation() — never touches Handshake for profiling. set_aicore_profiling_flag(k_args->enable_profiling_flag); if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) { // Stash only the slot pointer. The slot CONTENTS are written by - // AICPU's `l2_perf_aicpu_init` which runs concurrently with this + // AICPU's `l2_swimlane_aicpu_init` which runs concurrently with this // entry; dereferencing here would race with AICPU's write. 
The - // executor defers the deref via `get_aicore_rotation()` until inside + // executor defers the deref via `get_l2_swimlane_aicore_rotation()` until inside // the first-task branch — by then AICPU has dispatched, so init is // done and the slot is populated. - __gm__ uint64_t *rotation_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_ring_addr); - set_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr); + __gm__ uint64_t *rotation_table = + reinterpret_cast<__gm__ uint64_t *>(k_args->l2_swimlane_aicore_rotation_table); + set_l2_swimlane_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr); } aicore_execute(k_args->runtime_args, block_idx, core_type); diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 7926fa0e7..2a5d99053 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -16,7 +16,7 @@ #include "aicpu/dep_gen_collector_aicpu.h" #include "aicpu/device_log.h" #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/platform_regs.h" #include "aicpu/platform_aicpu_affinity.h" @@ -109,8 +109,8 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a set_orch_device_id(static_cast(k_args->device_id)); set_platform_dump_base(k_args->dump_data_base); set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR)); - set_platform_l2_perf_base(k_args->l2_perf_data_base); - set_platform_aicore_rotation_table(k_args->aicore_ring_addr); + set_platform_l2_swimlane_base(k_args->l2_swimlane_data_base); + set_platform_l2_swimlane_aicore_rotation_table(k_args->l2_swimlane_aicore_rotation_table); set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)); 
set_platform_pmu_base(k_args->pmu_data_base); set_platform_pmu_reg_addrs(k_args->pmu_reg_addrs); diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index 65b238589..78d8f4097 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -56,7 +56,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/comm_hccl.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index dfa0b9d63..9a01c133a 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -253,9 +253,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // Initialize per-subsystem shared memory. if (enable_l2_swimlane_) { - rc = init_l2_perf(num_aicore, device_id_); + rc = init_l2_swimlane(num_aicore, device_id_); if (rc != 0) { - LOG_ERROR("init_l2_perf failed: %d", rc); + LOG_ERROR("init_l2_swimlane failed: %d", rc); return rc; } } @@ -296,7 +296,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // On any exit from run() — success or early error — release the diagnostics // collectors' shared memory. They are only re-initialized per run(), so a // Worker reused across runs (e.g. a pytest session-scoped worker pool) would - // otherwise re-enter init_l2_perf() with stale state still allocated. + // otherwise re-enter init_l2_swimlane() with stale state still allocated. 
auto perf_cleanup = RAIIScopeGuard([this]() { finalize_collectors(); }); @@ -457,7 +457,7 @@ int DeviceRunner::finalize() { // `launch_aicpu_kernel` and `launch_aicore_kernel` live on `DeviceRunnerBase`. -int DeviceRunner::init_l2_perf(int num_aicore, int device_id) { +int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) { auto alloc_cb = [this](size_t size) -> void * { return mem_alloc_.alloc(size); }; @@ -479,16 +479,17 @@ int DeviceRunner::init_l2_perf(int num_aicore, int device_id) { return mem_alloc_.free(dev_ptr); }; - int rc = l2_perf_collector_.initialize( - num_aicore, device_id, l2_perf_level_, alloc_cb, register_cb, free_cb, output_prefix_ + int rc = l2_swimlane_collector_.initialize( + num_aicore, device_id, l2_swimlane_level_, alloc_cb, register_cb, free_cb, output_prefix_ ); if (rc != 0) { return rc; } - kernel_args_.args.l2_perf_data_base = reinterpret_cast(l2_perf_collector_.get_l2_perf_setup_device_ptr()); - kernel_args_.args.aicore_ring_addr = - reinterpret_cast(l2_perf_collector_.get_aicore_ring_addr_table_device_ptr()); + kernel_args_.args.l2_swimlane_data_base = + reinterpret_cast(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr()); + kernel_args_.args.l2_swimlane_aicore_rotation_table = + reinterpret_cast(l2_swimlane_collector_.get_aicore_ring_addr_table_device_ptr()); return 0; } @@ -634,8 +635,8 @@ void DeviceRunner::finalize_collectors() { return mem_alloc_.free(dev_ptr); }; - if (l2_perf_collector_.is_initialized()) { - l2_perf_collector_.finalize(unregister_cb, free_cb); + if (l2_swimlane_collector_.is_initialized()) { + l2_swimlane_collector_.finalize(unregister_cb, free_cb); } if (dump_collector_.is_initialized()) { dump_collector_.finalize(unregister_cb, free_cb); diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index ea2358a72..9a5b65b3f 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ 
-40,7 +40,7 @@ #include "prepare_callable_common.h" #include "common/kernel_args.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "device_arena.h" @@ -48,7 +48,7 @@ #include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper #include "host/function_cache.h" #include "host/memory_allocator.h" -#include "host/l2_perf_collector.h" +#include "host/l2_swimlane_collector.h" #include "host/tensor_dump_collector.h" #include "host/pmu_collector.h" #include "host/dep_gen_collector.h" @@ -195,7 +195,7 @@ class DeviceRunner : public DeviceRunnerBase { // acl_ready_, so runtimes that never ask for ACL (e.g. pure rt-layer) stay unaffected. bool acl_ready_{false}; - // Shared collectors (`l2_perf_collector_`, `dump_collector_`, + // Shared collectors (`l2_swimlane_collector_`, `dump_collector_`, // `pmu_collector_`, `scope_stats_collector_`) live on `DeviceRunnerBase`. // // dep_gen collector — captures orchestrator submit_task inputs for @@ -217,7 +217,7 @@ class DeviceRunner : public DeviceRunnerBase { * @param device_id Device ID for host registration * @return 0 on success, error code on failure */ - int init_l2_perf(int num_aicore, int device_id); + int init_l2_swimlane(int num_aicore, int device_id); /** * Initialize tensor dump shared memory and collector. @@ -274,7 +274,7 @@ class DeviceRunner : public DeviceRunnerBase { */ void finalize_collectors(); // Shared enable flags (`enable_l2_swimlane_`, `enable_dump_tensor_`, - // `enable_pmu_`, `enable_scope_stats_`, `l2_perf_level_`, + // `enable_pmu_`, `enable_scope_stats_`, `l2_swimlane_level_`, // `pmu_event_type_`, `output_prefix_`) live on `DeviceRunnerBase`. // // dep_gen enablement is a2a3-only. 
diff --git a/src/a2a3/platform/sim/aicore/inner_kernel.h b/src/a2a3/platform/sim/aicore/inner_kernel.h index 4f41e10a6..b92e2e279 100644 --- a/src/a2a3/platform/sim/aicore/inner_kernel.h +++ b/src/a2a3/platform/sim/aicore/inner_kernel.h @@ -38,12 +38,12 @@ // - with CACHELINE_OUT: write-back/flush (write to memory) -> release semantics // On aarch64, acquire-only fences do NOT prevent store-store reordering across the // barrier, so using acquire for the flush direction causes a race: the AICPU can -// observe the COND register FIN signal before l2_perf_buf->count is visible. +// observe the COND register FIN signal before l2_swimlane_buf->count is visible. // Using seq_cst (dmb ish / full barrier) covers both directions safely. // Use variadic macro to support both 2-arg and 3-arg calls. #define dcci(...) std::atomic_thread_fence(std::memory_order_seq_cst) -// dsb / mem_dsb_t — CANN provides these on real AICore; l2_perf_collector uses them after dcci flush. +// dsb / mem_dsb_t — CANN provides these on real AICore; l2_swimlane_collector uses them after dcci flush. // Simulation: full fence (same strength as dcci above) so AICPU ordering matches hardware intent. typedef int mem_dsb_t; #define dsb(_kind) \ diff --git a/src/a2a3/platform/sim/aicore/kernel.cpp b/src/a2a3/platform/sim/aicore/kernel.cpp index 033682d2b..4607f1526 100644 --- a/src/a2a3/platform/sim/aicore/kernel.cpp +++ b/src/a2a3/platform/sim/aicore/kernel.cpp @@ -23,7 +23,7 @@ #include "aicore/aicore.h" #include "aicore/aicore_profiling_state.h" #include "common/core_type.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "runtime.h" @@ -35,16 +35,16 @@ static pthread_key_t g_core_id_key; static pthread_key_t g_aicore_profiling_flag_key; // Slot pointer (NOT the dereferenced rotation address) — see // aicore_profiling_state.h for the lazy-deref contract. 
-static pthread_key_t g_aicore_rotation_slot_key; -static pthread_key_t g_aicore_rotation_key; +static pthread_key_t g_l2_swimlane_aicore_rotation_slot_key; +static pthread_key_t g_l2_swimlane_aicore_rotation_key; static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT; static void create_tls_keys() { pthread_key_create(&g_reg_base_key, nullptr); pthread_key_create(&g_core_id_key, nullptr); pthread_key_create(&g_aicore_profiling_flag_key, nullptr); - pthread_key_create(&g_aicore_rotation_slot_key, nullptr); - pthread_key_create(&g_aicore_rotation_key, nullptr); + pthread_key_create(&g_l2_swimlane_aicore_rotation_slot_key, nullptr); + pthread_key_create(&g_l2_swimlane_aicore_rotation_key, nullptr); } volatile uint8_t *sim_get_reg_base() { return static_cast(pthread_getspecific(g_reg_base_key)); } @@ -65,18 +65,19 @@ __aicore__ uint32_t get_aicore_profiling_flag() { return static_cast(reinterpret_cast(pthread_getspecific(g_aicore_profiling_flag_key))); } -__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { - pthread_setspecific(g_aicore_rotation_slot_key, reinterpret_cast(slot_ptr)); - pthread_setspecific(g_aicore_rotation_key, nullptr); // force lazy resolve on next get +__aicore__ void set_l2_swimlane_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { + pthread_setspecific(g_l2_swimlane_aicore_rotation_slot_key, reinterpret_cast(slot_ptr)); + pthread_setspecific(g_l2_swimlane_aicore_rotation_key, nullptr); // force lazy resolve on next get } -__aicore__ __gm__ AicoreRotation *get_aicore_rotation() { - auto *cached = reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key)); +__aicore__ __gm__ L2SwimlaneAicoreRotation *get_l2_swimlane_aicore_rotation() { + auto *cached = + reinterpret_cast<__gm__ L2SwimlaneAicoreRotation *>(pthread_getspecific(g_l2_swimlane_aicore_rotation_key)); if (cached != nullptr) return cached; - auto *slot = reinterpret_cast<__gm__ uint64_t *>(pthread_getspecific(g_aicore_rotation_slot_key)); + auto 
*slot = reinterpret_cast<__gm__ uint64_t *>(pthread_getspecific(g_l2_swimlane_aicore_rotation_slot_key)); if (slot == nullptr) return nullptr; // Lazy first-call resolve — see aicore_profiling_state.h. - cached = reinterpret_cast<__gm__ AicoreRotation *>(*slot); - pthread_setspecific(g_aicore_rotation_key, reinterpret_cast(cached)); + cached = reinterpret_cast<__gm__ L2SwimlaneAicoreRotation *>(*slot); + pthread_setspecific(g_l2_swimlane_aicore_rotation_key, reinterpret_cast(cached)); return cached; } @@ -102,7 +103,7 @@ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type); // executor with its original signature. extern "C" void aicore_execute_wrapper( __gm__ Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs, - uint32_t enable_profiling_flag, uint64_t aicore_ring_addr + uint32_t enable_profiling_flag, uint64_t l2_swimlane_aicore_rotation_table ) { pthread_once(&g_tls_once, create_tls_keys); @@ -118,14 +119,14 @@ extern "C" void aicore_execute_wrapper( // Publish per-core profiling state before the executor runs. set_aicore_profiling_flag(enable_profiling_flag); - if (aicore_ring_addr != 0) { + if (l2_swimlane_aicore_rotation_table != 0) { // Stash only the slot pointer; deref happens lazily inside - // get_aicore_rotation() once AICPU has populated the table. See + // get_l2_swimlane_aicore_rotation() once AICPU has populated the table. See // aicore_profiling_state.h. - uint64_t *rotation_table = reinterpret_cast(aicore_ring_addr); - set_aicore_rotation_slot(reinterpret_cast<__gm__ uint64_t *>(&rotation_table[block_idx])); + uint64_t *rotation_table = reinterpret_cast(l2_swimlane_aicore_rotation_table); + set_l2_swimlane_aicore_rotation_slot(reinterpret_cast<__gm__ uint64_t *>(&rotation_table[block_idx])); } else { - set_aicore_rotation_slot(nullptr); + set_l2_swimlane_aicore_rotation_slot(nullptr); } // Set core identity for pto-isa TPUSH/TPOP simulation. 
diff --git a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt index e32931c81..f55a68ce4 100644 --- a/src/a2a3/platform/sim/host/CMakeLists.txt +++ b/src/a2a3/platform/sim/host/CMakeLists.txt @@ -43,7 +43,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/dep_gen_collector.cpp" diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index a6a78bdc2..f350cec22 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -63,7 +63,7 @@ extern "C" __attribute__((weak, visibility("hidden"))) int dep_gen_replay_emit_d typedef int (*aicpu_execute_func_t)(Runtime *runtime); typedef void (*aicore_execute_func_t)( Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs, - uint32_t enable_profiling_flag, uint64_t aicore_ring_addr + uint32_t enable_profiling_flag, uint64_t l2_swimlane_aicore_rotation_table ); typedef void (*set_platform_regs_func_t)(uint64_t regs); typedef void (*set_platform_dump_base_func_t)(uint64_t dump_data_base); @@ -266,17 +266,18 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - set_platform_l2_perf_base_func_ = - reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_l2_perf_base")); - if (set_platform_l2_perf_base_func_ == nullptr) { - LOG_ERROR("dlsym failed for set_platform_l2_perf_base: %s", dlerror()); + set_platform_l2_swimlane_base_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, 
"set_platform_l2_swimlane_base")); + if (set_platform_l2_swimlane_base_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_l2_swimlane_base: %s", dlerror()); return -1; } - set_platform_aicore_rotation_table_func_ = - reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_aicore_rotation_table")); + set_platform_aicore_rotation_table_func_ = reinterpret_cast( + dlsym(aicpu_so_handle_, "set_platform_l2_swimlane_aicore_rotation_table") + ); if (set_platform_aicore_rotation_table_func_ == nullptr) { - LOG_ERROR("dlsym failed for set_platform_aicore_rotation_table: %s", dlerror()); + LOG_ERROR("dlsym failed for set_platform_l2_swimlane_aicore_rotation_table: %s", dlerror()); return -1; } @@ -523,9 +524,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // Initialize per-subsystem shared memory. if (enable_l2_swimlane_) { - rc = init_l2_perf(num_aicore, device_id_); + rc = init_l2_swimlane(num_aicore, device_id_); if (rc != 0) { - LOG_ERROR("init_l2_perf failed: %d", rc); + LOG_ERROR("init_l2_swimlane failed: %d", rc); return rc; } } @@ -566,7 +567,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // On any exit from run() — success or early error — release the diagnostics // collectors' shared memory. They are only re-initialized per run(), so a // Worker reused across runs (e.g. a pytest session-scoped worker pool) would - // otherwise re-enter init_l2_perf() with stale state still allocated. + // otherwise re-enter init_l2_swimlane() with stale state still allocated. 
auto perf_cleanup = RAIIScopeGuard([this]() { finalize_collectors(); }); @@ -658,8 +659,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { set_platform_regs_func_(kernel_args_.regs); set_platform_dump_base_func_(kernel_args_.dump_data_base); set_dump_tensor_enabled_func_(enable_dump_tensor_); - set_platform_l2_perf_base_func_(kernel_args_.l2_perf_data_base); - set_platform_aicore_rotation_table_func_(kernel_args_.aicore_ring_addr); + set_platform_l2_swimlane_base_func_(kernel_args_.l2_swimlane_data_base); + set_platform_aicore_rotation_table_func_(kernel_args_.l2_swimlane_aicore_rotation_table); set_l2_swimlane_enabled_func_(enable_l2_swimlane_); set_platform_pmu_base_func_(kernel_args_.pmu_data_base); set_platform_pmu_reg_addrs_func_(kernel_args_.pmu_reg_addrs); @@ -680,7 +681,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return create_thread(std::move(fn)); }; if (enable_l2_swimlane_) { - l2_perf_collector_.start(thread_factory); + l2_swimlane_collector_.start(thread_factory); } if (enable_dump_tensor_) { dump_collector_.start(thread_factory); @@ -740,7 +741,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { aicore_threads.push_back(create_thread([this, &runtime, i, core_type, physical_core_id]() { aicore_execute_func_( &runtime, i, core_type, physical_core_id, kernel_args_.regs, kernel_args_.enable_profiling_flag, - kernel_args_.aicore_ring_addr + kernel_args_.l2_swimlane_aicore_rotation_table ); })); } @@ -774,10 +775,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // Diagnostic exports use the per-task `output_prefix_` directory the user // set on CallConfig (CallConfig::validate() enforces non-empty upstream). 
if (enable_l2_swimlane_) { - l2_perf_collector_.stop(); - l2_perf_collector_.read_phase_header_metadata(); - l2_perf_collector_.reconcile_counters(); - l2_perf_collector_.export_swimlane_json(); + l2_swimlane_collector_.stop(); + l2_swimlane_collector_.read_phase_header_metadata(); + l2_swimlane_collector_.reconcile_counters(); + l2_swimlane_collector_.export_swimlane_json(); } if (enable_dump_tensor_) { @@ -851,7 +852,7 @@ void DeviceRunner::unload_executor_binaries() { set_platform_regs_func_ = nullptr; set_platform_dump_base_func_ = nullptr; set_dump_tensor_enabled_func_ = nullptr; - set_platform_l2_perf_base_func_ = nullptr; + set_platform_l2_swimlane_base_func_ = nullptr; set_platform_aicore_rotation_table_func_ = nullptr; set_l2_swimlane_enabled_func_ = nullptr; set_platform_pmu_base_func_ = nullptr; @@ -1225,7 +1226,7 @@ uint64_t DeviceRunner::upload_chip_callable_buffer(const ChipCallable *callable) // Performance Profiling Implementation // ============================================================================= -int DeviceRunner::init_l2_perf(int num_aicore, int device_id) { +int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) { auto alloc_cb = [this](size_t size) -> void * { return mem_alloc_.alloc(size); }; @@ -1234,16 +1235,17 @@ int DeviceRunner::init_l2_perf(int num_aicore, int device_id) { }; // Simulation: dev pointer is directly host-accessible, no register pass-through. 
- int rc = l2_perf_collector_.initialize( - num_aicore, device_id, l2_perf_level_, alloc_cb, nullptr, free_cb, output_prefix_ + int rc = l2_swimlane_collector_.initialize( + num_aicore, device_id, l2_swimlane_level_, alloc_cb, nullptr, free_cb, output_prefix_ ); if (rc != 0) { return rc; } - kernel_args_.l2_perf_data_base = reinterpret_cast(l2_perf_collector_.get_l2_perf_setup_device_ptr()); - kernel_args_.aicore_ring_addr = - reinterpret_cast(l2_perf_collector_.get_aicore_ring_addr_table_device_ptr()); + kernel_args_.l2_swimlane_data_base = + reinterpret_cast(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr()); + kernel_args_.l2_swimlane_aicore_rotation_table = + reinterpret_cast(l2_swimlane_collector_.get_aicore_ring_addr_table_device_ptr()); return 0; } @@ -1331,8 +1333,8 @@ void DeviceRunner::finalize_collectors() { return mem_alloc_.free(dev_ptr); }; - if (l2_perf_collector_.is_initialized()) { - l2_perf_collector_.finalize(nullptr, free_cb); + if (l2_swimlane_collector_.is_initialized()) { + l2_swimlane_collector_.finalize(nullptr, free_cb); } if (dump_collector_.is_initialized()) { dump_collector_.finalize(nullptr, free_cb); diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index f1a44e59b..a25f3cd30 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -49,12 +49,12 @@ #include "common/core_type.h" #include "common/kernel_args.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "host/function_cache.h" #include "host/memory_allocator.h" -#include "host/l2_perf_collector.h" +#include "host/l2_swimlane_collector.h" #include "host/tensor_dump_collector.h" #include "host/pmu_collector.h" #include "host/dep_gen_collector.h" @@ -187,8 +187,8 @@ class DeviceRunner { * Runtime struct / run() arg list so all three 
travel the same way. */ void set_l2_swimlane_enabled(int level) { - l2_perf_level_ = static_cast(level); - enable_l2_swimlane_ = (l2_perf_level_ != L2PerfLevel::DISABLED); + l2_swimlane_level_ = static_cast(level); + enable_l2_swimlane_ = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED); } void set_dump_tensor_enabled(bool enable) { enable_dump_tensor_ = enable; } void set_pmu_enabled(int enable_pmu) { @@ -197,7 +197,7 @@ class DeviceRunner { } void set_dep_gen_enabled(bool enable) { enable_dep_gen_ = enable; } void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; } - // Directory under which all diagnostic artifacts (l2_perf_records.json / + // Directory under which all diagnostic artifacts (l2_swimlane_records.json / // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic // is enabled; CallConfig::validate() enforces this contract upstream. void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; } @@ -381,7 +381,7 @@ class DeviceRunner { void (*set_platform_regs_func_)(uint64_t){nullptr}; void (*set_platform_dump_base_func_)(uint64_t){nullptr}; void (*set_dump_tensor_enabled_func_)(bool){nullptr}; - void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr}; + void (*set_platform_l2_swimlane_base_func_)(uint64_t){nullptr}; void (*set_platform_aicore_rotation_table_func_)(uint64_t){nullptr}; void (*set_l2_swimlane_enabled_func_)(bool){nullptr}; void (*set_platform_pmu_base_func_)(uint64_t){nullptr}; @@ -395,7 +395,7 @@ class DeviceRunner { std::string aicore_so_path_; // Performance profiling - L2PerfCollector l2_perf_collector_; + L2SwimlaneCollector l2_swimlane_collector_; // Tensor dump (independent shared memory + memory manager) TensorDumpCollector dump_collector_; @@ -429,7 +429,7 @@ class DeviceRunner { * @param device_id Device ID (ignored in simulation) * @return 0 on success, error code on failure */ - int init_l2_perf(int num_aicore, int device_id); + int 
init_l2_swimlane(int num_aicore, int device_id); int init_tensor_dump(Runtime &runtime, int device_id); @@ -456,9 +456,9 @@ class DeviceRunner { bool enable_pmu_{false}; bool enable_dep_gen_{false}; bool enable_scope_stats_{false}; - L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() - PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() - std::string output_prefix_{}; // diagnostic artifact root directory + L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() + PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() + std::string output_prefix_{}; // diagnostic artifact root directory }; #endif // SRC_A2A3_PLATFORM_SIM_HOST_DEVICE_RUNNER_H_ diff --git a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp similarity index 68% rename from src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp rename to src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp index 7ab5b7498..af44aced2 100644 --- a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp @@ -10,15 +10,15 @@ */ /** - * @file l2_perf_collector_aicpu.cpp + * @file l2_swimlane_collector_aicpu.cpp * @brief AICPU performance data collection implementation (SPSC free queue) * - * Uses per-core L2PerfBufferState with SPSC free queues for O(1) buffer switching. + * Uses per-core L2SwimlaneAicpuTaskPool with SPSC free queues for O(1) buffer switching. * Host memory manager dynamically allocates replacement buffers and pushes * them into the free_queue. Device pops from free_queue when switching. 
*/ -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include #include @@ -29,79 +29,83 @@ #include "common/unified_log.h" // Cached pointers for hot-path access (set during init) -static AicpuPhaseHeader *s_phase_header = nullptr; -static L2PerfDataHeader *s_l2_perf_header = nullptr; +static L2SwimlaneAicpuPhaseHeader *s_l2_swimlane_aicpu_phase_header = nullptr; +static L2SwimlaneDataHeader *s_l2_swimlane_header = nullptr; -// Per-core L2PerfBufferState cache -static L2PerfBufferState *s_perf_buffer_states[PLATFORM_MAX_CORES] = {}; +// Per-core L2SwimlaneAicpuTaskPool cache +static L2SwimlaneAicpuTaskPool *s_aicpu_task_pools[PLATFORM_MAX_CORES] = {}; -// Per-core L2PerfAicoreBufferState cache (lives in the same shared region; +// Per-core L2SwimlaneAicoreTaskPool cache (lives in the same shared region; // host writes initial pool + the rotation channel that AICore polls). // // All AICore-side bookkeeping (rotation channel, free queue, // total_record_count, current_buf_seq) is owned by this shared struct — see -// l2_perf_profiling.h. We deliberately do not keep AICPU-process-local +// l2_swimlane_profiling.h. We deliberately do not keep AICPU-process-local // mirror counters because the struct's volatile fields are the single // source of truth across init/complete/rotate/flush. The high-water-mark // formula `total_record_count - current_buf_seq * BUFFER_SIZE` correctly // handles the failed-rotation case (free_queue empty or ready_queue full) // since current_buf_seq only bumps on a successful rotation. -static L2PerfAicoreBufferState *s_aicore_buffer_states[PLATFORM_MAX_CORES] = {}; +static L2SwimlaneAicoreTaskPool *s_aicore_task_pools[PLATFORM_MAX_CORES] = {}; // Per-core cached current-records-buffer pointer. Written by AICPU when // rotating buffers from inside `complete_record`. 
AICore writes to its own -// per-core L2PerfAicoreBuffer (host-allocated, AICPU rotates) and AICPU +// per-core L2SwimlaneAicoreTaskBuffer (host-allocated, AICPU rotates) and AICPU // never reads from it on the hot path. -static L2PerfBuffer *s_perf_records_buffers[PLATFORM_MAX_CORES] = {}; +static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORES] = {}; -// Per-thread PhaseBufferState cache -static PhaseBufferState *s_phase_buffer_states[PLATFORM_MAX_AICPU_THREADS] = {}; -static PhaseBuffer *s_current_phase_buf[PLATFORM_MAX_AICPU_THREADS] = {}; +// Per-thread L2SwimlaneAicpuPhasePool cache +static L2SwimlaneAicpuPhasePool *s_aicpu_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; +static L2SwimlaneAicpuPhaseBuffer *s_current_aicpu_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; static int s_orch_thread_idx = -1; -// L2 perf platform state. Published by the host (via dlsym'd setters on sim) +// L2 swimlane platform state. Published by the host (via dlsym'd setters on sim) // or by the AICPU kernel entry (onboard) before perf init runs, so downstream // perf code can discover enablement + device-base without reading the generic // Runtime struct. Two channels (mirrors PMU): // - g_enable_l2_swimlane (bool) — set at kernel entry from the bitmask bit -// - g_l2_perf_level (L2PerfLevel) — promoted in -// l2_perf_aicpu_init from the shared-memory header so +// - g_l2_swimlane_level (L2SwimlaneLevel) — promoted in +// l2_swimlane_aicpu_init from the shared-memory header so // `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates have the granular -// value (exposed via get_l2_perf_level()). -static uint64_t g_platform_l2_perf_base = 0; +// value (exposed via get_l2_swimlane_level()). 
+static uint64_t g_platform_l2_swimlane_base = 0; static bool g_enable_l2_swimlane = false; -static L2PerfLevel g_l2_perf_level = L2PerfLevel::DISABLED; +static L2SwimlaneLevel g_l2_swimlane_level = L2SwimlaneLevel::DISABLED; -// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr). +// AICore rotation-table device pointer (= KernelArgs::l2_swimlane_aicore_rotation_table). // Published by the host (sim: dlsym'd setter; onboard: from k_args via the // kernel entry); AICPU init walks it to fill per-core &rotation addresses. -static uint64_t g_platform_aicore_rotation_table = 0; +static uint64_t g_platform_l2_swimlane_aicore_rotation_table = 0; -extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base) { g_platform_l2_perf_base = l2_perf_data_base; } -extern "C" uint64_t get_platform_l2_perf_base() { return g_platform_l2_perf_base; } +extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base) { + g_platform_l2_swimlane_base = l2_swimlane_data_base; +} +extern "C" uint64_t get_platform_l2_swimlane_base() { return g_platform_l2_swimlane_base; } extern "C" void set_l2_swimlane_enabled(bool enable) { g_enable_l2_swimlane = enable; } extern "C" bool is_l2_swimlane_enabled() { return g_enable_l2_swimlane; } -extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr) { - g_platform_aicore_rotation_table = table_addr; +extern "C" void set_platform_l2_swimlane_aicore_rotation_table(uint64_t table_addr) { + g_platform_l2_swimlane_aicore_rotation_table = table_addr; +} +extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table() { + return g_platform_l2_swimlane_aicore_rotation_table; } -extern "C" uint64_t get_platform_aicore_rotation_table() { return g_platform_aicore_rotation_table; } -L2PerfLevel get_l2_perf_level() { return g_l2_perf_level; } +L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; } /** * Enqueue ready buffer to per-thread queue * - * @param header L2PerfDataHeader pointer 
+ * @param header L2SwimlaneDataHeader pointer * @param thread_idx Thread index * @param core_index Core index (or thread_idx for phase entries) * @param buffer_ptr Device pointer to the full buffer * @param buffer_seq Sequence number for ordering - * @param is_phase 0 = L2PerfRecord, 1 = Phase + * @param kind Buffer kind discriminator (see L2SwimlaneBufferKind) * @return 0 on success, -1 if queue full */ static int enqueue_ready_buffer( - L2PerfDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq, - uint32_t is_phase + L2SwimlaneDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq, + L2SwimlaneBufferKind kind ) { uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE; uint32_t current_tail = header->queue_tails[thread_idx]; @@ -114,7 +118,7 @@ static int enqueue_ready_buffer( } header->queues[thread_idx][current_tail].core_index = core_index; - header->queues[thread_idx][current_tail].is_phase = is_phase; + header->queues[thread_idx][current_tail].kind = kind; header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; header->queue_tails[thread_idx] = next_tail; @@ -122,42 +126,42 @@ static int enqueue_ready_buffer( return 0; } -void l2_perf_aicpu_init(int worker_count) { - void *l2_perf_base = reinterpret_cast(g_platform_l2_perf_base); - if (l2_perf_base == nullptr) { - LOG_ERROR("l2_perf_data_base is NULL, cannot initialize profiling"); +void l2_swimlane_aicpu_init(int worker_count) { + void *l2_swimlane_base = reinterpret_cast(g_platform_l2_swimlane_base); + if (l2_swimlane_base == nullptr) { + LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize profiling"); return; } - s_l2_perf_header = get_l2_perf_header(l2_perf_base); + s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base); // Read the granular perf_level from the shared-memory header (host wrote - // it in 
L2PerfCollector::initialize). The kernel-entry setter only seeded + // it in L2SwimlaneCollector::initialize). The kernel-entry setter only seeded // the binary g_enable_l2_swimlane via the bitmask bit. - g_l2_perf_level = static_cast(s_l2_perf_header->l2_perf_level); + g_l2_swimlane_level = static_cast(s_l2_swimlane_header->l2_swimlane_level); LOG_INFO_V0( - "Initializing performance profiling for %d cores (free queue), l2_perf_level=%u", worker_count, - static_cast(g_l2_perf_level) + "Initializing performance profiling for %d cores (free queue), l2_swimlane_level=%u", worker_count, + static_cast(g_l2_swimlane_level) ); - // Populate the per-core AicoreRotation device-address table. AICore reads - // `aicore_ring_addr[block_idx]` from KernelArgs to find its rotation + // Populate the per-core L2SwimlaneAicoreRotation device-address table. AICore reads + // `l2_swimlane_aicore_rotation_table[block_idx]` from KernelArgs to find its rotation // channel; the table itself is host-allocated, but the entries are // device-internal addresses (`&ac_state->rotation`) that the host would // otherwise have to translate from host-mapped to device-mapped. AICPU // already runs on the device, so it can write the addresses directly // without any translation — that keeps the host side decoupled from the // AICore shared-memory layout. 
- uint64_t *rotation_table = reinterpret_cast(g_platform_aicore_rotation_table); + uint64_t *rotation_table = reinterpret_cast(g_platform_l2_swimlane_aicore_rotation_table); // Pop first buffer from free_queue for each core for (int i = 0; i < worker_count; i++) { - L2PerfBufferState *state = get_perf_buffer_state(l2_perf_base, i); - L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(l2_perf_base, worker_count, i); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(l2_swimlane_base, i); + L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(l2_swimlane_base, worker_count, i); - s_perf_buffer_states[i] = state; - s_aicore_buffer_states[i] = ac_state; + s_aicpu_task_pools[i] = state; + s_aicore_task_pools[i] = ac_state; if (rotation_table != nullptr) { rotation_table[i] = reinterpret_cast(&ac_state->rotation); @@ -176,15 +180,15 @@ void l2_perf_aicpu_init(int worker_count) { state->current_buf_seq = 0; wmb(); - L2PerfBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast(buf_ptr); buf->count = 0; - s_perf_records_buffers[i] = buf; + s_current_aicpu_task_buffers[i] = buf; LOG_DEBUG("Core %d: popped initial buffer (addr=0x%lx)", i, buf_ptr); } else { LOG_ERROR("Core %d: free_queue is empty during init!", i); state->current_buf_ptr = 0; - s_perf_records_buffers[i] = nullptr; + s_current_aicpu_task_buffers[i] = nullptr; } // Prime the AICore rotation channel with the initial buffer. @@ -201,7 +205,7 @@ void l2_perf_aicpu_init(int worker_count) { ac_state->rotation.current_buf_ptr = ac_buf_ptr; ac_state->rotation.generation = 1; wmb(); - L2PerfAicoreBuffer *ac_buf = reinterpret_cast(ac_buf_ptr); + L2SwimlaneAicoreTaskBuffer *ac_buf = reinterpret_cast(ac_buf_ptr); ac_buf->count = 0; LOG_DEBUG("Core %d: primed AICore rotation with buf=0x%lx, gen=1", i, ac_buf_ptr); } else { @@ -218,18 +222,18 @@ void l2_perf_aicpu_init(int worker_count) { } /** - * Internal records-buffer rotation. 
Called from `l2_perf_aicpu_complete_record` + * Internal records-buffer rotation. Called from `l2_swimlane_aicpu_complete_task` * after a record is committed and the buffer hits capacity. Only swaps an * AICPU-private records pointer — AICore reads from a stable ring and is * unaffected by this call. */ static void switch_records_buffer(int core_id, int thread_idx) { - L2PerfBufferState *state = s_perf_buffer_states[core_id]; + L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id]; if (state == nullptr) { return; } - L2PerfBuffer *full_buf = s_perf_records_buffers[core_id]; + L2SwimlaneAicpuTaskBuffer *full_buf = s_current_aicpu_task_buffers[core_id]; if (full_buf == nullptr) { return; } @@ -252,7 +256,9 @@ static void switch_records_buffer(int core_id, int thread_idx) { // Enqueue full buffer to ReadyQueue uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, state->current_buf_ptr, seq, 0); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, core_id, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask + ); if (rc != 0) { LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id); // Revert: discard data and keep writing @@ -270,21 +276,21 @@ static void switch_records_buffer(int core_id, int thread_idx) { state->current_buf_seq = seq + 1; wmb(); - L2PerfBuffer *new_buf = reinterpret_cast(new_buf_ptr); + L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast(new_buf_ptr); new_buf->count = 0; - s_perf_records_buffers[core_id] = new_buf; + s_current_aicpu_task_buffers[core_id] = new_buf; LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr); } // Try to rotate the AICore buffer for `core_id`. 
Called from the completion -// path after a successful L2PerfRecord commit so the just-FIN'd task's +// path after a successful L2SwimlaneAicpuTaskRecord commit so the just-FIN'd task's // AICore record is guaranteed to be in the old buffer before we enqueue it. // On success bumps `ac_state->current_buf_seq`; on failure (empty free queue // or full ready queue) the old buffer is abandoned in place, AICore overflows // it from now on, and the drop count grows. static void aicore_rotate(int core_id, int thread_idx) { - L2PerfAicoreBufferState *ac_state = s_aicore_buffer_states[core_id]; + L2SwimlaneAicoreTaskPool *ac_state = s_aicore_task_pools[core_id]; if (ac_state == nullptr) { return; } @@ -312,10 +318,12 @@ static void aicore_rotate(int core_id, int thread_idx) { // Enqueue the just-filled AICore buffer with count = BUFFER_SIZE. if (old_buf_ptr != 0) { - L2PerfAicoreBuffer *old_buf = reinterpret_cast(old_buf_ptr); + L2SwimlaneAicoreTaskBuffer *old_buf = reinterpret_cast(old_buf_ptr); old_buf->count = static_cast(PLATFORM_AICORE_BUFFER_SIZE); wmb(); - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, old_buf_ptr, seq, /*is_phase=*/2); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, core_id, old_buf_ptr, seq, L2SwimlaneBufferKind::AicoreTask + ); if (rc != 0) { LOG_ERROR( "Thread %d: Core %d failed to enqueue AICore buffer (queue full), %d records lost", thread_idx, core_id, @@ -332,7 +340,7 @@ static void aicore_rotate(int core_id, int thread_idx) { rmb(); ac_state->free_queue.head = head + 1; ac_state->current_buf_seq = seq + 1; - L2PerfAicoreBuffer *new_buf = reinterpret_cast(new_buf_ptr); + L2SwimlaneAicoreTaskBuffer *new_buf = reinterpret_cast(new_buf_ptr); new_buf->count = 0; wmb(); // ensure new_buf->count=0 visible before AICore sees new ptr @@ -342,18 +350,18 @@ static void aicore_rotate(int core_id, int thread_idx) { } // Public no-op shim kept so callers compile during the cross-runtime -// transition; the 
rotation has been moved into l2_perf_aicpu_complete_record +// transition; the rotation has been moved into l2_swimlane_aicpu_complete_task // where it is race-free vs in-flight AICore record writes. -void l2_perf_aicpu_maybe_rotate_aicore(int /*core_id*/, int /*thread_idx*/) {} +void l2_swimlane_aicpu_maybe_rotate_aicore(int /*core_id*/, int /*thread_idx*/) {} -int l2_perf_aicpu_complete_record( +int l2_swimlane_aicpu_complete_task( int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type, uint64_t dispatch_time, uint64_t finish_time ) { if (core_id < 0 || core_id >= PLATFORM_MAX_CORES) { return -1; } - L2PerfBufferState *state = s_perf_buffer_states[core_id]; + L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id]; if (state == nullptr) { return -1; } @@ -362,14 +370,14 @@ int l2_perf_aicpu_complete_record( // `device_total - (collected + dropped)`. state->total_record_count += 1; - L2PerfBuffer *l2_perf_buf = s_perf_records_buffers[core_id]; - if (l2_perf_buf == nullptr) { + L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id]; + if (l2_swimlane_buf == nullptr) { // No active records buffer (init ran out of free buffers); count as drop // so host reconciliation stays consistent. state->dropped_record_count += 1; return -1; } - uint32_t count = l2_perf_buf->count; + uint32_t count = l2_swimlane_buf->count; if (count >= PLATFORM_PROF_BUFFER_SIZE) { // Defensive: should not happen because we rotate at end of every commit. state->dropped_record_count += 1; @@ -377,14 +385,14 @@ int l2_perf_aicpu_complete_record( } // AICore-as-producer: AICore writes start/end/task_id directly into its - // own per-core L2PerfAicoreBuffer (indexed by reg_task_id % SIZE). AICPU + // own per-core L2SwimlaneAicoreTaskBuffer (indexed by reg_task_id % SIZE). AICPU // writes only AICPU-owned fields here; start/end stay zero on-device and // are patched by the host when the buffer is consumed. 
Join key is // `reg_task_id` (monotonic per core), stored alongside the PTO2-encoded // `task_id` so the host can match without a hashmap lookup. This // eliminates the per-task rmb() + staging cache-line read the previous // design required. - L2PerfRecord *record = &l2_perf_buf->records[count]; + L2SwimlaneAicpuTaskRecord *record = &l2_swimlane_buf->records[count]; record->start_time = 0; record->end_time = 0; record->duration = 0; @@ -394,7 +402,7 @@ int l2_perf_aicpu_complete_record( record->core_type = core_type; // AICPU_TIMING and above: dispatch/finish timing. - if (g_l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (g_l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { record->dispatch_time = dispatch_time; record->finish_time = finish_time; } else { @@ -403,10 +411,10 @@ int l2_perf_aicpu_complete_record( } uint32_t new_count = count + 1; - l2_perf_buf->count = new_count; + l2_swimlane_buf->count = new_count; wmb(); - // Rotate AICpu's L2PerfBuffer after the write so the just-committed + // Rotate AICpu's L2SwimlaneAicpuTaskBuffer after the write so the just-committed // record is preserved. if (new_count >= PLATFORM_PROF_BUFFER_SIZE) { switch_records_buffer(core_id, thread_idx); @@ -435,7 +443,7 @@ int l2_perf_aicpu_complete_record( // total_record_count is uint32_t — wraps after ~4 G completions per core. // At realistic dispatch rates this is multi-week continuous-run territory; // we accept the limitation rather than widening the on-device counter. 
- L2PerfAicoreBufferState *ac_state = s_aicore_buffer_states[core_id]; + L2SwimlaneAicoreTaskPool *ac_state = s_aicore_task_pools[core_id]; if (ac_state != nullptr) { uint32_t completed = ac_state->total_record_count + 1; ac_state->total_record_count = completed; @@ -447,13 +455,13 @@ int l2_perf_aicpu_complete_record( return 0; } -void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num) { +void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num) { if (!g_enable_l2_swimlane) { return; } - void *l2_perf_base = reinterpret_cast(g_platform_l2_perf_base); - if (l2_perf_base == nullptr) { + void *l2_swimlane_base = reinterpret_cast(g_platform_l2_swimlane_base); + if (l2_swimlane_base == nullptr) { return; } @@ -465,7 +473,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - L2PerfBufferState *state = s_perf_buffer_states[core_id]; + L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id]; if (state == nullptr) continue; rmb(); @@ -473,15 +481,17 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in if (buf_ptr == 0) { // No active buffer } else { - L2PerfBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast(buf_ptr); if (buf->count > 0) { uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, buf_ptr, seq, 0); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, core_id, buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask + ); if (rc == 0) { LOG_INFO_V0("Thread %d: Core %d flushed buffer with %u records", thread_idx, core_id, buf->count); flushed_count++; state->current_buf_ptr = 0; - s_perf_records_buffers[core_id] = nullptr; + s_current_aicpu_task_buffers[core_id] = nullptr; wmb(); } else { // ready_queue full at end-of-run: account the loss and clear the @@ 
-494,7 +504,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in state->dropped_record_count = state->dropped_record_count + buf->count; buf->count = 0; state->current_buf_ptr = 0; - s_perf_records_buffers[core_id] = nullptr; + s_current_aicpu_task_buffers[core_id] = nullptr; wmb(); } } @@ -510,7 +520,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in // unknown number of dropped overflow attempts; the formula clamps // to BUFFER_SIZE in that case rather than stamping a stale partial // count. - L2PerfAicoreBufferState *ac_state = s_aicore_buffer_states[core_id]; + L2SwimlaneAicoreTaskPool *ac_state = s_aicore_task_pools[core_id]; if (ac_state == nullptr) continue; rmb(); @@ -527,12 +537,14 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in uint32_t ac_mark = (live > static_cast(PLATFORM_AICORE_BUFFER_SIZE)) ? static_cast(PLATFORM_AICORE_BUFFER_SIZE) : live; - L2PerfAicoreBuffer *ac_buf = reinterpret_cast(ac_buf_ptr); + L2SwimlaneAicoreTaskBuffer *ac_buf = reinterpret_cast(ac_buf_ptr); ac_buf->count = ac_mark; wmb(); uint32_t ac_seq = ac_state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, ac_buf_ptr, ac_seq, /*is_phase=*/2); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, core_id, ac_buf_ptr, ac_seq, L2SwimlaneBufferKind::AicoreTask + ); if (rc == 0) { LOG_INFO_V0( "Thread %d: Core %d flushed AICore buffer (seq=%u, count=%u)", thread_idx, core_id, ac_seq, ac_mark @@ -552,22 +564,24 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in LOG_INFO_V0("Thread %d: Performance buffer flush complete, %d buffers flushed", thread_idx, flushed_count); } -void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { - void *l2_perf_base = reinterpret_cast(g_platform_l2_perf_base); - if (l2_perf_base == nullptr) { - LOG_ERROR("l2_perf_data_base is NULL, cannot initialize 
phase profiling"); +void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads) { + void *l2_swimlane_base = reinterpret_cast(g_platform_l2_swimlane_base); + if (l2_swimlane_base == nullptr) { + LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize phase profiling"); return; } - s_phase_header = get_phase_header(l2_perf_base, worker_count); - s_l2_perf_header = get_l2_perf_header(l2_perf_base); + s_l2_swimlane_aicpu_phase_header = get_phase_header(l2_swimlane_base, worker_count); + s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base); - s_phase_header->magic = AICPU_PHASE_MAGIC; - s_phase_header->num_sched_threads = num_sched_threads; - s_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD; - s_phase_header->num_cores = 0; + s_l2_swimlane_aicpu_phase_header->magic = L2_SWIMLANE_AICPU_PHASE_MAGIC; + s_l2_swimlane_aicpu_phase_header->num_sched_threads = num_sched_threads; + s_l2_swimlane_aicpu_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD; + s_l2_swimlane_aicpu_phase_header->num_cores = 0; - memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread)); + memset( + s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread) + ); // Cache per-thread record pointers and clear buffers // Include all threads: scheduler + orchestrator (orchestrators may become schedulers) @@ -576,9 +590,9 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { total_threads = PLATFORM_MAX_AICPU_THREADS; } for (int t = 0; t < total_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(l2_perf_base, worker_count, t); + L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(l2_swimlane_base, worker_count, t); - s_phase_buffer_states[t] = state; + s_aicpu_phase_pools[t] = state; // Pop first buffer from free_queue rmb(); @@ -593,22 +607,22 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { 
state->current_buf_seq = 0; wmb(); - PhaseBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast(buf_ptr); buf->count = 0; - s_current_phase_buf[t] = buf; + s_current_aicpu_phase_buffers[t] = buf; LOG_DEBUG("Thread %d: popped initial phase buffer (addr=0x%lx)", t, buf_ptr); } else { LOG_ERROR("Thread %d: phase free_queue is empty during init!", t); state->current_buf_ptr = 0; - s_current_phase_buf[t] = nullptr; + s_current_aicpu_phase_buffers[t] = nullptr; } } // Clear remaining slots for (int t = total_threads; t < PLATFORM_MAX_AICPU_THREADS; t++) { - s_phase_buffer_states[t] = nullptr; - s_current_phase_buf[t] = nullptr; + s_aicpu_phase_pools[t] = nullptr; + s_current_aicpu_phase_buffers[t] = nullptr; } wmb(); @@ -623,28 +637,30 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { * Switch phase buffer when current buffer is full (free queue version) * * Enqueues the full buffer to ReadyQueue and pops the next buffer from free_queue. - * If no free buffer is available, sets s_current_phase_buf to nullptr so subsequent + * If no free buffer is available, sets s_current_aicpu_phase_buffers to nullptr so subsequent * records are dropped (preserving already-enqueued data). 
*/ static void switch_phase_buffer(int thread_idx) { - PhaseBufferState *state = s_phase_buffer_states[thread_idx]; + L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx]; if (state == nullptr) return; - PhaseBuffer *full_buf = s_current_phase_buf[thread_idx]; + L2SwimlaneAicpuPhaseBuffer *full_buf = s_current_aicpu_phase_buffers[thread_idx]; if (full_buf == nullptr) return; LOG_INFO_V0("Thread %d: phase buffer is full (count=%u)", thread_idx, full_buf->count); // Enqueue to ReadyQueue uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, state->current_buf_ptr, seq, 1); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, thread_idx, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase + ); if (rc != 0) { LOG_ERROR( "Thread %d: failed to enqueue phase buffer (queue full), %u records lost!", thread_idx, full_buf->count ); state->dropped_record_count = state->dropped_record_count + full_buf->count; full_buf->count = 0; - s_current_phase_buf[thread_idx] = nullptr; + s_current_aicpu_phase_buffers[thread_idx] = nullptr; state->current_buf_ptr = 0; wmb(); return; @@ -663,29 +679,29 @@ static void switch_phase_buffer(int thread_idx) { state->current_buf_seq = seq + 1; wmb(); - PhaseBuffer *new_buf = reinterpret_cast(new_buf_ptr); + L2SwimlaneAicpuPhaseBuffer *new_buf = reinterpret_cast(new_buf_ptr); new_buf->count = 0; - s_current_phase_buf[thread_idx] = new_buf; + s_current_aicpu_phase_buffers[thread_idx] = new_buf; LOG_INFO_V0("Thread %d: switched to new phase buffer", thread_idx); } else { // No free buffer available, drop subsequent records LOG_WARN("Thread %d: no free phase buffer available, dropping records until Host catches up", thread_idx); - s_current_phase_buf[thread_idx] = nullptr; + s_current_aicpu_phase_buffers[thread_idx] = nullptr; state->current_buf_ptr = 0; wmb(); } } -void l2_perf_aicpu_record_phase( - int thread_idx, AicpuPhaseId phase_id, uint64_t 
start_time, uint64_t end_time, uint32_t loop_iter, +void l2_swimlane_aicpu_record_phase( + int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, uint64_t tasks_processed, uint32_t extra1, uint32_t extra2 ) { - if (s_phase_header == nullptr) { + if (s_l2_swimlane_aicpu_phase_header == nullptr) { return; } - PhaseBufferState *state = s_phase_buffer_states[thread_idx]; + L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx]; if (state == nullptr) { return; } @@ -694,7 +710,7 @@ void l2_perf_aicpu_record_phase( // as `device_total - (collected + dropped)` (mirrors PERF accounting). state->total_record_count += 1; - PhaseBuffer *buf = s_current_phase_buf[thread_idx]; + L2SwimlaneAicpuPhaseBuffer *buf = s_current_aicpu_phase_buffers[thread_idx]; // Try to recover from nullptr (no buffer was available on previous switch) if (buf == nullptr) { @@ -710,9 +726,9 @@ void l2_perf_aicpu_record_phase( state->current_buf_seq = state->current_buf_seq + 1; wmb(); - buf = reinterpret_cast(buf_ptr); + buf = reinterpret_cast(buf_ptr); buf->count = 0; - s_current_phase_buf[thread_idx] = buf; + s_current_aicpu_phase_buffers[thread_idx] = buf; LOG_INFO_V0("Thread %d: recovered phase buffer", thread_idx); } @@ -727,7 +743,7 @@ void l2_perf_aicpu_record_phase( if (idx >= PLATFORM_PHASE_RECORDS_PER_THREAD) { // Buffer full, switch to next buffer switch_phase_buffer(thread_idx); - buf = s_current_phase_buf[thread_idx]; + buf = s_current_aicpu_phase_buffers[thread_idx]; if (buf == nullptr) { state->dropped_record_count += 1; return; @@ -739,7 +755,7 @@ void l2_perf_aicpu_record_phase( } } - AicpuPhaseRecord *record = &buf->records[idx]; + L2SwimlaneAicpuPhaseRecord *record = &buf->records[idx]; record->start_time = start_time; record->end_time = end_time; record->loop_iter = loop_iter; @@ -751,21 +767,21 @@ void l2_perf_aicpu_record_phase( buf->count = idx + 1; } -void l2_perf_aicpu_set_orch_thread_idx(int thread_idx) { 
s_orch_thread_idx = thread_idx; } +void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thread_idx; } -void l2_perf_aicpu_record_orch_phase( - AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id +void l2_swimlane_aicpu_record_orch_phase( + L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id ) { - if (s_orch_thread_idx < 0 || s_phase_header == nullptr) return; - l2_perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id); + if (s_orch_thread_idx < 0 || s_l2_swimlane_aicpu_phase_header == nullptr) return; + l2_swimlane_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id); } -void l2_perf_aicpu_flush_phase_buffers(int thread_idx) { - if (s_phase_header == nullptr || s_l2_perf_header == nullptr) { +void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) { + if (s_l2_swimlane_aicpu_phase_header == nullptr || s_l2_swimlane_header == nullptr) { return; } - PhaseBufferState *state = s_phase_buffer_states[thread_idx]; + L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx]; if (state == nullptr) return; rmb(); @@ -775,13 +791,15 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) { return; } - PhaseBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast(buf_ptr); if (buf->count == 0) { return; } uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, buf_ptr, seq, 1); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, thread_idx, buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase + ); if (rc == 0) { LOG_INFO_V0("Thread %d: flushed phase buffer with %u records", thread_idx, buf->count); } else { @@ -790,28 +808,30 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) { buf->count = 0; } state->current_buf_ptr = 0; - 
s_current_phase_buf[thread_idx] = nullptr; + s_current_aicpu_phase_buffers[thread_idx] = nullptr; wmb(); } -void l2_perf_aicpu_init_core_assignments(int total_cores) { - if (s_phase_header == nullptr) { +void l2_swimlane_aicpu_init_core_assignments(int total_cores) { + if (s_l2_swimlane_aicpu_phase_header == nullptr) { return; } - memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread)); - s_phase_header->num_cores = static_cast(total_cores); + memset( + s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread) + ); + s_l2_swimlane_aicpu_phase_header->num_cores = static_cast(total_cores); wmb(); LOG_INFO_V0("Core-to-thread mapping init: %d cores", total_cores); } -void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) { - if (s_phase_header == nullptr) { +void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) { + if (s_l2_swimlane_aicpu_phase_header == nullptr) { return; } for (int i = 0; i < core_num; i++) { int core_id = core_ids[i]; if (core_id >= 0 && core_id < PLATFORM_MAX_CORES) { - s_phase_header->core_to_thread[core_id] = static_cast(thread_idx); + s_l2_swimlane_aicpu_phase_header->core_to_thread[core_id] = static_cast(thread_idx); } } wmb(); diff --git a/src/a2a3/platform/src/host/l2_perf_collector.cpp b/src/a2a3/platform/src/host/l2_swimlane_collector.cpp similarity index 82% rename from src/a2a3/platform/src/host/l2_perf_collector.cpp rename to src/a2a3/platform/src/host/l2_swimlane_collector.cpp index 98c72a928..c06d083e1 100644 --- a/src/a2a3/platform/src/host/l2_perf_collector.cpp +++ b/src/a2a3/platform/src/host/l2_swimlane_collector.cpp @@ -10,15 +10,15 @@ */ /** - * @file l2_perf_collector.cpp + * @file l2_swimlane_collector.cpp * @brief Performance data collector implementation. 
The mgmt-thread + buffer-pool * machinery lives in profiling_common::BufferPoolManager parameterized by - * L2PerfModule (host/l2_perf_collector.h); the poll loop lives in + * L2SwimlaneModule (host/l2_swimlane_collector.h); the poll loop lives in * profiling_common::ProfilerBase. This file owns the per-buffer * on_buffer_collected callback and the export logic. */ -#include "host/l2_perf_collector.h" +#include "host/l2_swimlane_collector.h" #include #include @@ -36,7 +36,7 @@ #include "common/unified_log.h" // ============================================================================= -// L2PerfCollector Implementation +// L2SwimlaneCollector Implementation // ============================================================================= /** @@ -57,16 +57,16 @@ * because idle is reconstructed from record gaps. */ static constexpr uint32_t kAicpuOrchPhaseIdBase = 16; -static bool is_scheduler_phase(AicpuPhaseId id) { return static_cast(id) < kAicpuOrchPhaseIdBase; } +static bool is_scheduler_phase(L2SwimlaneAicpuPhaseId id) { return static_cast(id) < kAicpuOrchPhaseIdBase; } -L2PerfCollector::~L2PerfCollector() { +L2SwimlaneCollector::~L2SwimlaneCollector() { stop(); if (shm_host_ != nullptr) { - LOG_WARN("L2PerfCollector destroyed without finalize()"); + LOG_WARN("L2SwimlaneCollector destroyed without finalize()"); } } -void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) { +void *L2SwimlaneCollector::alloc_single_buffer(size_t size, void **host_ptr_out) { void *dev_ptr = alloc_cb_(size); if (dev_ptr == nullptr) { LOG_ERROR("Failed to allocate buffer (%zu bytes)", size); @@ -92,12 +92,12 @@ void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) { return dev_ptr; } -int L2PerfCollector::initialize( - int num_aicore, int device_id, L2PerfLevel l2_perf_level, const L2PerfAllocCallback &alloc_cb, - L2PerfRegisterCallback register_cb, const L2PerfFreeCallback &free_cb, const std::string &output_prefix +int 
L2SwimlaneCollector::initialize( + int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb, + L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix ) { if (shm_host_ != nullptr) { - LOG_ERROR("L2PerfCollector already initialized"); + LOG_ERROR("L2SwimlaneCollector already initialized"); return -1; } @@ -109,7 +109,7 @@ int L2PerfCollector::initialize( } num_aicore_ = num_aicore; - l2_perf_level_ = l2_perf_level; + l2_swimlane_level_ = l2_swimlane_level; output_prefix_ = output_prefix; total_perf_collected_ = 0; total_phase_collected_ = 0; @@ -126,9 +126,9 @@ int L2PerfCollector::initialize( LOG_DEBUG("Shared memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); - LOG_DEBUG(" Header size: %zu bytes", sizeof(L2PerfDataHeader)); - LOG_DEBUG(" L2PerfBufferState size: %zu bytes each", sizeof(L2PerfBufferState)); - LOG_DEBUG(" PhaseBufferState size:%zu bytes each", sizeof(PhaseBufferState)); + LOG_DEBUG(" Header size: %zu bytes", sizeof(L2SwimlaneDataHeader)); + LOG_DEBUG(" L2SwimlaneAicpuTaskPool size: %zu bytes each", sizeof(L2SwimlaneAicpuTaskPool)); + LOG_DEBUG(" L2SwimlaneAicpuPhasePool size:%zu bytes each", sizeof(L2SwimlaneAicpuPhasePool)); LOG_DEBUG(" Total shared memory: %zu bytes (%zu KB)", total_size, total_size / 1024); // Step 2: Allocate shared memory for slot arrays @@ -158,7 +158,7 @@ int L2PerfCollector::initialize( } // Step 4: Initialize header - L2PerfDataHeader *header = get_l2_perf_header(perf_host_ptr); + L2SwimlaneDataHeader *header = get_l2_swimlane_header(perf_host_ptr); for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) { memset(header->queues[t], 0, sizeof(header->queues[t])); @@ -167,18 +167,18 @@ int L2PerfCollector::initialize( } header->num_cores = num_aicore; - header->l2_perf_level = static_cast(l2_perf_level_); + header->l2_swimlane_level = static_cast(l2_swimlane_level_); - LOG_DEBUG("Initialized 
L2PerfDataHeader:"); + LOG_DEBUG("Initialized L2SwimlaneDataHeader:"); LOG_DEBUG(" num_cores: %d", header->num_cores); - LOG_DEBUG(" l2_perf_level: %u", header->l2_perf_level); + LOG_DEBUG(" l2_swimlane_level: %u", header->l2_swimlane_level); LOG_DEBUG(" buffer_capacity: %d", PLATFORM_PROF_BUFFER_SIZE); LOG_DEBUG(" queue capacity: %d", PLATFORM_PROF_READYQUEUE_SIZE); - // Step 5: Initialize L2PerfBufferStates — 1 buffer per core in free_queue, rest to recycled pool + // Step 5: Initialize L2SwimlaneAicpuTaskPools — 1 buffer per core in free_queue, rest to recycled pool for (int i = 0; i < num_aicore; i++) { - L2PerfBufferState *state = get_perf_buffer_state(perf_host_ptr, i); - memset(state, 0, sizeof(L2PerfBufferState)); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(perf_host_ptr, i); + memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool)); state->free_queue.head = 0; state->free_queue.tail = 0; @@ -187,19 +187,19 @@ int L2PerfCollector::initialize( for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_CORE; s++) { void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(L2PerfBuffer), &host_buf_ptr); + void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuTaskBuffer), &host_buf_ptr); if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate L2PerfBuffer for core %d, buffer %d", i, s); + LOG_ERROR("Failed to allocate L2SwimlaneAicpuTaskBuffer for core %d, buffer %d", i, s); return -1; } - L2PerfBuffer *buf = reinterpret_cast(host_buf_ptr); - memset(buf, 0, sizeof(L2PerfBuffer)); + L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast(host_buf_ptr); + memset(buf, 0, sizeof(L2SwimlaneAicpuTaskBuffer)); buf->count = 0; if (s == 0) { state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); } else { - manager_.push_recycled(static_cast(ProfBufferType::PERF_RECORD), dev_buf_ptr); + manager_.push_recycled(static_cast(ProfBufferType::AICPU_TASK), dev_buf_ptr); } } wmb(); @@ -207,27 +207,27 @@ int L2PerfCollector::initialize( wmb(); } 
- // Step 5b: Initialize L2PerfAicoreBufferStates — per-core AICore rotation + // Step 5b: Initialize L2SwimlaneAicoreTaskPools — per-core AICore rotation // channel + buffer pool. Same SPSC pattern as the AICPU pool above. for (int i = 0; i < num_aicore; i++) { - L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i); - memset(ac_state, 0, sizeof(L2PerfAicoreBufferState)); + L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i); + memset(ac_state, 0, sizeof(L2SwimlaneAicoreTaskPool)); for (int s = 0; s < PLATFORM_AICORE_BUFFERS_PER_CORE; s++) { void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(L2PerfAicoreBuffer), &host_buf_ptr); + void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicoreTaskBuffer), &host_buf_ptr); if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate L2PerfAicoreBuffer for core %d, buffer %d", i, s); + LOG_ERROR("Failed to allocate L2SwimlaneAicoreTaskBuffer for core %d, buffer %d", i, s); return -1; } - L2PerfAicoreBuffer *buf = reinterpret_cast(host_buf_ptr); - memset(buf, 0, sizeof(L2PerfAicoreBuffer)); + L2SwimlaneAicoreTaskBuffer *buf = reinterpret_cast(host_buf_ptr); + memset(buf, 0, sizeof(L2SwimlaneAicoreTaskBuffer)); buf->count = 0; if (s == 0) { ac_state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); } else { - manager_.push_recycled(static_cast(ProfBufferType::AICORE), dev_buf_ptr); + manager_.push_recycled(static_cast(ProfBufferType::AICORE_TASK), dev_buf_ptr); } } wmb(); @@ -235,15 +235,16 @@ int L2PerfCollector::initialize( wmb(); } LOG_DEBUG( - "Initialized buffer pools: %d L2PerfBuffers/core + %d L2PerfAicoreBuffers/core (1 in free_queue, " + "Initialized buffer pools: %d L2SwimlaneAicpuTaskBuffers/core + %d L2SwimlaneAicoreTaskBuffers/core (1 in " + "free_queue, " "rest in recycled pool)", PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE ); // Step 5c: Standalone uint64_t[num_aicore] 
table that will hold per-core - // AicoreRotation device addresses. Host only allocates the bytes and - // hands the device pointer to AICPU via KernelArgs::aicore_ring_addr; - // AICPU itself fills the entries inside `l2_perf_aicpu_init` (it has + // L2SwimlaneAicoreRotation device addresses. Host only allocates the bytes and + // hands the device pointer to AICPU via KernelArgs::l2_swimlane_aicore_rotation_table; + // AICPU itself fills the entries inside `l2_swimlane_aicpu_init` (it has // direct access to `&ac_state->rotation` device addresses, no // host-to-device translation needed). AICore reads // rotation_table[block_idx] at kernel entry. @@ -252,7 +253,7 @@ int L2PerfCollector::initialize( void *rotation_table_host = nullptr; void *rotation_table_dev = alloc_single_buffer(table_bytes, &rotation_table_host); if (rotation_table_dev == nullptr) { - LOG_ERROR("Failed to allocate aicore_ring_addr (rotation) table (%zu bytes)", table_bytes); + LOG_ERROR("Failed to allocate l2_swimlane_aicore_rotation_table (rotation) table (%zu bytes)", table_bytes); return -1; } aicore_ring_addr_table_dev_ = rotation_table_dev; @@ -260,8 +261,8 @@ int L2PerfCollector::initialize( // Step 6: Initialize PhaseBufferStates — 1 buffer per thread in free_queue, rest to recycled pool for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); - memset(state, 0, sizeof(PhaseBufferState)); + L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); + memset(state, 0, sizeof(L2SwimlaneAicpuPhasePool)); state->free_queue.head = 0; state->free_queue.tail = 0; @@ -270,19 +271,19 @@ int L2PerfCollector::initialize( for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); + void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuPhaseBuffer), &host_buf_ptr); if (dev_buf_ptr == 
nullptr) { - LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); + LOG_ERROR("Failed to allocate L2SwimlaneAicpuPhaseBuffer for thread %d, buffer %d", t, s); return -1; } - PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); - memset(buf, 0, sizeof(PhaseBuffer)); + L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast(host_buf_ptr); + memset(buf, 0, sizeof(L2SwimlaneAicpuPhaseBuffer)); buf->count = 0; if (s == 0) { state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); } else { - manager_.push_recycled(static_cast(ProfBufferType::PHASE), dev_buf_ptr); + manager_.push_recycled(static_cast(ProfBufferType::AICPU_PHASE), dev_buf_ptr); } } wmb(); @@ -297,8 +298,8 @@ int L2PerfCollector::initialize( wmb(); // Step 7: Stash device pointer for the caller to publish via - // kernel_args.l2_perf_data_base (read back via get_l2_perf_setup_device_ptr()). - LOG_DEBUG("L2 perf device base = 0x%lx", reinterpret_cast(perf_dev_ptr)); + // kernel_args.l2_swimlane_data_base (read back via get_l2_swimlane_setup_device_ptr()). 
+ LOG_DEBUG("L2 swimlane device base = 0x%lx", reinterpret_cast(perf_dev_ptr)); perf_shared_mem_dev_ = perf_dev_ptr; shm_host_ = perf_host_ptr; @@ -315,8 +316,8 @@ int L2PerfCollector::initialize( // ProfilerBase callbacks // --------------------------------------------------------------------------- -void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) { - L2PerfBuffer *buf = reinterpret_cast(info.host_buffer_ptr); +void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) { + L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast(info.host_buffer_ptr); rmb(); uint32_t count = buf->count; if (count > PLATFORM_PROF_BUFFER_SIZE) { @@ -331,8 +332,8 @@ void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) { } } -void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) { - PhaseBuffer *buf = reinterpret_cast(info.host_buffer_ptr); +void L2SwimlaneCollector::copy_phase_buffer(const ReadyBufferInfo &info) { + L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast(info.host_buffer_ptr); rmb(); uint32_t count = buf->count; if (count > static_cast(PLATFORM_PHASE_RECORDS_PER_THREAD)) { @@ -369,8 +370,8 @@ void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) { // propagated). The "missing" slot's previous contents are zero because // allocate_single_buffer memsets at allocation. // - Flush-path partial buffer whose tail wasn't reached. 
-void L2PerfCollector::copy_aicore_buffer(const ReadyBufferInfo &info) { - L2PerfAicoreBuffer *buf = reinterpret_cast(info.host_buffer_ptr); +void L2SwimlaneCollector::copy_aicore_buffer(const ReadyBufferInfo &info) { + L2SwimlaneAicoreTaskBuffer *buf = reinterpret_cast(info.host_buffer_ptr); rmb(); uint32_t core_index = info.index; if (core_index >= static_cast(num_aicore_)) { @@ -384,7 +385,7 @@ void L2PerfCollector::copy_aicore_buffer(const ReadyBufferInfo &info) { dst.reserve(dst.size() + count); uint32_t skipped = 0; for (uint32_t i = 0; i < count; i++) { - const L2PerfAicoreRecord &r = buf->records[i]; + const L2SwimlaneAicoreTaskRecord &r = buf->records[i]; if (r.start_time == 0) { skipped++; continue; @@ -400,10 +401,10 @@ void L2PerfCollector::copy_aicore_buffer(const ReadyBufferInfo &info) { } } -void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) { - if (info.type == ProfBufferType::PERF_RECORD) { +void L2SwimlaneCollector::on_buffer_collected(const ReadyBufferInfo &info) { + if (info.type == ProfBufferType::AICPU_TASK) { copy_perf_buffer(info); - } else if (info.type == ProfBufferType::PHASE) { + } else if (info.type == ProfBufferType::AICPU_PHASE) { copy_phase_buffer(info); } else { copy_aicore_buffer(info); @@ -419,7 +420,7 @@ void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) { // clear current_buf_ptr on the device side. Host's job here is purely // accounting + sanity check. -void L2PerfCollector::reconcile_counters() { +void L2SwimlaneCollector::reconcile_counters() { if (shm_host_ == nullptr) { return; } @@ -431,9 +432,7 @@ void L2PerfCollector::reconcile_counters() { // queue full / flush failure) bump dropped_record_count. // silent_loss = device_total - (collected + dropped) // and any non-zero silent loss flags an unaccounted gap on top of the - // already-classified dropped losses. 
`mismatch_record_count` remains in - // L2PerfBufferState for ABI continuity but is no longer written — the - // AICore staging-slot read it guarded was removed. + // already-classified dropped losses. // // Sanity sub-check: after stop(), any active buffer with records must // have been flushed by AICPU (success → current_buf_ptr=0; failure → @@ -445,7 +444,7 @@ void L2PerfCollector::reconcile_counters() { auto read_buf_count, uint64_t collected, bool optional) { int leftover_active = 0; for (int i = 0; i < unit_count; i++) { - L2PerfBufferState *state = get_state(i); + L2SwimlaneAicpuTaskPool *state = get_state(i); uint64_t buf_ptr = state->current_buf_ptr; if (buf_ptr == 0) continue; void *host_ptr = manager_.resolve_host_ptr(reinterpret_cast(buf_ptr)); @@ -453,7 +452,7 @@ void L2PerfCollector::reconcile_counters() { uint32_t count = read_buf_count(host_ptr); if (count == 0) continue; LOG_ERROR( - "L2Perf reconcile: %s %d has un-flushed %s buffer (current_buf_ptr=0x%lx, count=%u) " + "L2Swimlane reconcile: %s %d has un-flushed %s buffer (current_buf_ptr=0x%lx, count=%u) " "after stop() — device flush failed", unit_name, i, kind, static_cast(buf_ptr), count ); @@ -462,55 +461,44 @@ void L2PerfCollector::reconcile_counters() { uint64_t total_device = 0; uint64_t dropped_device = 0; - uint64_t mismatch_device = 0; for (int i = 0; i < unit_count; i++) { - L2PerfBufferState *state = get_state(i); + L2SwimlaneAicpuTaskPool *state = get_state(i); total_device += state->total_record_count; dropped_device += state->dropped_record_count; - mismatch_device += state->mismatch_record_count; } // PHASE counters are populated only by runtimes that actually emit // phase records; skip the comparison entirely when nothing happened. 
- if (optional && total_device == 0 && collected == 0 && dropped_device == 0 && mismatch_device == 0) { + if (optional && total_device == 0 && collected == 0 && dropped_device == 0) { return; } if (dropped_device > 0) { LOG_WARN( - "L2Perf reconcile: %lu %s records dropped on device side (buffer full / " + "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / " "ready_queue full).", static_cast(dropped_device), kind ); } - if (mismatch_device > 0) { - LOG_ERROR( - "L2Perf reconcile: %lu %s records carry non-zero mismatch_record_count — " - "this counter is no longer written post-AICore-as-producer; non-zero " - "indicates stale device state or a corrupted L2PerfBufferState", - static_cast(mismatch_device), kind - ); - } - uint64_t accounted = collected + dropped_device + mismatch_device; + uint64_t accounted = collected + dropped_device; if (accounted != total_device) { LOG_WARN( - "L2Perf reconcile: %s count mismatch (collected=%lu + dropped=%lu + mismatch=%lu != " + "L2Swimlane reconcile: %s count mismatch (collected=%lu + dropped=%lu != " "device_total=%lu, silent_loss=%ld)", kind, static_cast(collected), static_cast(dropped_device), - static_cast(mismatch_device), static_cast(total_device), - static_cast(total_device) - static_cast(accounted) + static_cast(total_device), static_cast(total_device) - static_cast(accounted) ); } else { LOG_INFO_V0( - "L2Perf reconcile: %s counts match (collected=%lu, dropped=%lu, mismatch=%lu, device_total=%lu)", kind, + "L2Swimlane reconcile: %s counts match (collected=%lu, dropped=%lu, device_total=%lu)", kind, static_cast(collected), static_cast(dropped_device), - static_cast(mismatch_device), static_cast(total_device) + static_cast(total_device) ); } if (leftover_active > 0) { LOG_ERROR( - "L2Perf reconcile: %d %s(s) had un-cleared %s current_buf_ptr — see prior errors", leftover_active, + "L2Swimlane reconcile: %d %s(s) had un-cleared %s current_buf_ptr — see prior errors", leftover_active, unit_name, 
kind ); } @@ -522,7 +510,7 @@ void L2PerfCollector::reconcile_counters() { return get_perf_buffer_state(shm_host_, core_index); }, [](void *host_ptr) { - return reinterpret_cast(host_ptr)->count; + return reinterpret_cast(host_ptr)->count; }, total_perf_collected_, /*optional=*/false ); @@ -533,24 +521,25 @@ void L2PerfCollector::reconcile_counters() { return get_phase_buffer_state(shm_host_, num_aicore_, thread_index); }, [](void *host_ptr) { - return reinterpret_cast(host_ptr)->count; + return reinterpret_cast(host_ptr)->count; }, total_phase_collected_, /*optional=*/true ); } -void L2PerfCollector::read_phase_header_metadata() { +void L2SwimlaneCollector::read_phase_header_metadata() { if (shm_host_ == nullptr) { return; } rmb(); - AicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_); + L2SwimlaneAicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_); - if (phase_header->magic != AICPU_PHASE_MAGIC) { + if (phase_header->magic != L2_SWIMLANE_AICPU_PHASE_MAGIC) { LOG_INFO_V0( - "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic, AICPU_PHASE_MAGIC + "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic, + L2_SWIMLANE_AICPU_PHASE_MAGIC ); return; } @@ -593,7 +582,7 @@ void L2PerfCollector::read_phase_header_metadata() { LOG_INFO_V0("Phase metadata collection complete: has_phase_data=%s", has_phase_data_ ? "yes" : "no"); } -// AICore-as-producer post-processing: walk each L2PerfRecord we collected +// AICore-as-producer post-processing: walk each L2SwimlaneAicpuTaskRecord we collected // and patch start/end/duration from the per-core stream of AICore records // that arrived through the ready queue. AICore rotation guarantees each // per-core stream is a complete prefix of "all dispatched tasks on this @@ -601,11 +590,11 @@ void L2PerfCollector::read_phase_header_metadata() { // free_queue while the session runs, so an arbitrarily long session works). 
// // We build a small `reg_task_id → (start, end)` map per core (size on the -// order of N_tasks_per_core) and patch each L2PerfRecord by its +// order of N_tasks_per_core) and patch each L2SwimlaneAicpuTaskRecord by its // reg_task_id field. Using a map instead of direct indexing tolerates -// AICPU-side L2PerfBuffer drops (a missing L2PerfRecord doesn't break +// AICPU-side L2SwimlaneAicpuTaskBuffer drops (a missing L2SwimlaneAicpuTaskRecord doesn't break // alignment) and lets the same code work for both runtimes. -void L2PerfCollector::join_aicore_records() { +void L2SwimlaneCollector::join_aicore_records() { if (shm_host_ == nullptr) { return; } @@ -678,7 +667,8 @@ void L2PerfCollector::join_aicore_records() { total_unmatched += unmatched; if (unmatched > 0) { LOG_WARN( - "Core %d: %lu L2PerfRecord(s) had no matching AICore entry (AICore buffer drops on rotation? " + "Core %d: %lu L2SwimlaneAicpuTaskRecord(s) had no matching AICore entry (AICore buffer drops on " + "rotation? " "PLATFORM_AICORE_BUFFERS_PER_CORE=%d may be undersized for host drain rate)", core_idx, static_cast(unmatched), PLATFORM_AICORE_BUFFERS_PER_CORE ); @@ -691,7 +681,7 @@ void L2PerfCollector::join_aicore_records() { ); } -int L2PerfCollector::export_swimlane_json() { +int L2SwimlaneCollector::export_swimlane_json() { // Step 0: Join AICore-emitted start/end/task_id records into the AICPU // record stream (AICore-as-producer design). join_aicore_records(); @@ -721,7 +711,7 @@ int L2PerfCollector::export_swimlane_json() { // Step 3: Flatten per-core vectors into tagged records with core_id derived from index struct TaggedRecord { - const L2PerfRecord *record; + const L2SwimlaneAicpuTaskRecord *record; uint32_t core_id; }; std::vector tagged_records; @@ -769,7 +759,7 @@ int L2PerfCollector::export_swimlane_json() { // Step 5: Compose output path. Filename is fixed (no timestamp) — the // caller-provided directory is the per-task uniqueness boundary. 
- std::string filepath = output_prefix_ + "/l2_perf_records.json"; + std::string filepath = output_prefix_ + "/l2_swimlane_records.json"; // Step 6: Open JSON file for writing std::ofstream outfile(filepath); @@ -782,16 +772,16 @@ int L2PerfCollector::export_swimlane_json() { // Fanout fields are emitted as empty/zero — the device-side hot path no // longer carries them. Downstream (swimlane_converter.py) joins fanout // from the sibling deps.json (dep_gen output). - int l2_perf_level = static_cast(l2_perf_level_); + int l2_swimlane_level = static_cast(l2_swimlane_level_); outfile << "{\n"; - outfile << " \"l2_perf_level\": " << l2_perf_level << ",\n"; + outfile << " \"l2_swimlane_level\": " << l2_swimlane_level << ",\n"; outfile << " \"tasks\": [\n"; // First pass: filter unmatched records (start_time == 0) so we emit a // valid JSON without trailing-comma fix-ups. Unmatched records arise when // the AICore-side rotation dropped a buffer (free queue empty) and that // task's AICore record never made it to the host, leaving the AICPU-side - // L2PerfRecord with `start_time == 0`. Subtracting base_time_cycles from + // L2SwimlaneAicpuTaskRecord with `start_time == 0`. Subtracting base_time_cycles from // 0 would underflow to a huge double timestamp, painting an off-the-chart // bar in the swimlane viewer; safer to drop the record. 
The drop count is // already surfaced via `dropped_record_count` and the join warning logged @@ -845,12 +835,12 @@ int L2PerfCollector::export_swimlane_json() { outfile << " ]"; // Step 8: Write phase profiling data (level >= 3) - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - auto sched_phase_name = [](AicpuPhaseId id) -> const char * { + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + auto sched_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * { switch (id) { - case AicpuPhaseId::SCHED_COMPLETE: + case L2SwimlaneAicpuPhaseId::SCHED_COMPLETE: return "complete"; - case AicpuPhaseId::SCHED_DISPATCH: + case L2SwimlaneAicpuPhaseId::SCHED_DISPATCH: return "dispatch"; default: // Legacy SCHED_IDLE_WAIT (3) and SCHED_SCAN (2) land here on @@ -861,9 +851,9 @@ int L2PerfCollector::export_swimlane_json() { } }; - auto orch_phase_name = [](AicpuPhaseId id) -> const char * { + auto orch_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * { switch (id) { - case AicpuPhaseId::ORCH_SUBMIT: + case L2SwimlaneAicpuPhaseId::ORCH_SUBMIT: return "orch_submit"; default: // Legacy per-sub-step orch ids 17-24 land here on old captures; @@ -889,7 +879,7 @@ int L2PerfCollector::export_swimlane_json() { // Phase-specific deltas (currently only SCHED_DISPATCH carries // pop_hit / pop_miss). Other phases pass zero extras; omitting // them keeps the JSON terse per record. - if (pr.phase_id == AicpuPhaseId::SCHED_DISPATCH) { + if (pr.phase_id == L2SwimlaneAicpuPhaseId::SCHED_DISPATCH) { outfile << ", \"pop_hit\": " << pr.extra1 << ", \"pop_miss\": " << pr.extra2; } outfile << "}"; @@ -903,14 +893,14 @@ int L2PerfCollector::export_swimlane_json() { outfile << " ]"; // Orchestrator timing is no longer emitted as a separate aggregate - // block. Per-event AicpuPhaseRecord[] entries (emitted as + // block. 
Per-event L2SwimlaneAicpuPhaseRecord[] entries (emitted as // aicpu_orchestrator_phases below) are the single source of truth; // the run-window envelope is still visible in the device-side // LOG_INFO_V9 "Thread N: orch_start=… orch_end=… orch_cost=…" line. // Per-task orchestrator phase records (level >= 4, filtered from unified collected_phase_records_) bool has_orch_phases = false; - if (l2_perf_level_ >= L2PerfLevel::ORCH_PHASES) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { for (const auto &v : collected_phase_records_) { for (const auto &r : v) { if (!is_scheduler_phase(r.phase_id)) { @@ -969,7 +959,7 @@ int L2PerfCollector::export_swimlane_json() { return 0; } -int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2PerfFreeCallback &free_cb) { +int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, const L2SwimlaneFreeCallback &free_cb) { if (shm_host_ == nullptr) { return 0; } @@ -984,10 +974,10 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe // alloc_single_buffer installed via halHostRegister is unregistered // before its device memory is freed. Without this the Ascend HAL's // per-device registration table accumulates leaked entries across - // init_l2_perf() invocations and back-to-back l2_swimlane tests on + // init_l2_swimlane() invocations and back-to-back l2_swimlane tests on // a reused Worker fail at rc=8 from halHostRegister. - // Free standalone aicore_ring_addr table + // Free standalone l2_swimlane_aicore_rotation_table table release_one_buffer(aicore_ring_addr_table_dev_, unregister_cb, free_cb); aicore_ring_addr_table_dev_ = nullptr; @@ -998,8 +988,8 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe // Per-core: current buffer + free_queue slots — these were owned by // the AICPU side, not the framework. Same drain pattern for both the - // L2PerfBuffer pool and the L2PerfAicoreBuffer pool. 
- auto drain_free_queue = [&](L2PerfFreeQueue &fq) { + // L2SwimlaneAicpuTaskBuffer pool and the L2SwimlaneAicoreTaskBuffer pool. + auto drain_free_queue = [&](L2SwimlaneFreeQueue &fq) { rmb(); uint32_t head = fq.head; uint32_t tail = fq.tail; @@ -1016,12 +1006,12 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe }; for (int i = 0; i < num_aicore_; i++) { - L2PerfBufferState *state = get_perf_buffer_state(shm_host_, i); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm_host_, i); release_one_buffer(reinterpret_cast(state->current_buf_ptr), unregister_cb, free_cb); state->current_buf_ptr = 0; drain_free_queue(state->free_queue); - L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(shm_host_, num_aicore_, i); + L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(shm_host_, num_aicore_, i); release_one_buffer(reinterpret_cast(ac_state->rotation.current_buf_ptr), unregister_cb, free_cb); ac_state->rotation.current_buf_ptr = 0; drain_free_queue(ac_state->free_queue); @@ -1029,7 +1019,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe int num_phase_threads = PLATFORM_MAX_AICPU_THREADS; for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(shm_host_, num_aicore_, t); + L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm_host_, num_aicore_, t); release_one_buffer(reinterpret_cast(state->current_buf_ptr), unregister_cb, free_cb); state->current_buf_ptr = 0; diff --git a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp index c66b11ed9..33b1b6600 100644 --- a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -11,9 +11,9 @@ #include "aicore/aicore.h" #include "aicore/aicore_profiling_state.h" -#include "aicore/l2_perf_collector_aicore.h" +#include 
"aicore/l2_swimlane_collector_aicore.h" #include "aicore/pmu_collector_aicore.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" // Platform configuration (C/C++ compatible) #include "runtime.h" @@ -55,15 +55,15 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); uint32_t enable_profiling_flag = get_aicore_profiling_flag(); - bool l2_perf_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE); + bool l2_swimlane_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE); bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR); bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); - // Per-core AicoreRotation channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp. + // Per-core L2SwimlaneAicoreRotation channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp. // Deferred until first task so AICPU's init has populated the rotation // table (the dispatch itself proves init is done). - __gm__ AicoreRotation *l2_perf_rotation = nullptr; - AicoreLocalState l2_perf_local = {nullptr, 0, 0}; + __gm__ L2SwimlaneAicoreRotation *l2_swimlane_rotation = nullptr; + L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, 0, 0}; volatile uint32_t task_id = AICPU_IDLE_TASK_ID; volatile uint32_t last_task_id = AICPU_IDLE_TASK_ID; @@ -86,8 +86,8 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in write_reg(RegId::COND, MAKE_ACK_VALUE(actual_task_id)); // First-task lazy resolve of the rotation channel. 
- if (l2_perf_enabled && l2_perf_rotation == nullptr) { - l2_perf_rotation = get_aicore_rotation(); + if (l2_swimlane_enabled && l2_swimlane_rotation == nullptr) { + l2_swimlane_rotation = get_l2_swimlane_aicore_rotation(); } __gm__ Task *task_ptr = &(runtime->tasks[actual_task_id]); @@ -107,9 +107,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in pipe_barrier(PIPE_ALL); } - if (l2_perf_enabled) { + if (l2_swimlane_enabled) { uint64_t end_time = get_sys_cnt_aicore(); - l2_perf_aicore_record_task(l2_perf_rotation, &l2_perf_local, actual_task_id, start_time, end_time); + l2_swimlane_aicore_record_task( + l2_swimlane_rotation, &l2_swimlane_local, actual_task_id, start_time, end_time + ); } last_task_id = task_id; diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 62b13670b..770ec41dc 100644 --- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -16,13 +16,13 @@ #include "aicpu/device_log.h" #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/platform_regs.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" #include "callable.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "runtime.h" @@ -140,7 +140,7 @@ struct AicpuExecutor { inline bool try_dispatch_task( int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head, - int &ready_count, bool l2_perf_enabled, Runtime &runtime + int &ready_count, bool l2_swimlane_enabled, Runtime &runtime ); }; @@ -239,7 +239,7 @@ inline void AicpuExecutor::resolve_task_dependencies( // Try to dispatch a task from thread-local queue to a core 
inline bool AicpuExecutor::try_dispatch_task( int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head, int &ready_count, - bool l2_perf_enabled, [[maybe_unused]] Runtime &runtime + bool l2_swimlane_enabled, [[maybe_unused]] Runtime &runtime ) { if (ready_count <= 0) { return false; @@ -251,8 +251,8 @@ inline bool AicpuExecutor::try_dispatch_task( ready_count--; // Profiling: record the real AICPU dispatch point for this core. Buffer - // rotation is handled inside l2_perf_aicpu_complete_record. - if (l2_perf_enabled && get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) { + // rotation is handled inside l2_swimlane_aicpu_complete_task. + if (l2_swimlane_enabled && get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } @@ -329,7 +329,7 @@ int AicpuExecutor::init(Runtime *runtime) { } if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init(runtime->worker_count); + l2_swimlane_aicpu_init(runtime->worker_count); } // Perform core discovery: handshake with all cores and collect core type information @@ -679,8 +679,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const int verification_warning_count = 0; const int MAX_VERIFICATION_WARNINGS = 10; - bool l2_perf_enabled = is_l2_swimlane_enabled(); - L2PerfLevel l2_perf_level = get_l2_perf_level(); + bool l2_swimlane_enabled = is_l2_swimlane_enabled(); + L2SwimlaneLevel l2_swimlane_level = get_l2_swimlane_level(); // PMU runs require single-issue dispatch — overlapping in-flight tasks // pollute per-task PMU counters. 
Cached at function scope: // is_pmu_enabled() is extern "C" and the compiler cannot hoist it @@ -707,7 +707,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const ); // Initialize dispatch timestamps for all cores (only needed at level >= 2) - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { uint64_t dispatch_start_time = get_sys_cnt_aicpu(); for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; @@ -744,38 +744,38 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const // Profiling: when prev_running_id exists, its AICore timing was // published to the ring slot first, so complete it BEFORE the // pending task's record to maintain buffer ordering. - if (l2_perf_enabled) { - uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + if (l2_swimlane_enabled) { + uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; if (prev_running_id != AICPU_TASK_INVALID) { Task *prev_task = &runtime.tasks[prev_running_id]; - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(prev_running_id), static_cast(prev_running_id), prev_task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id, + "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id, prev_running_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } - finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? 
get_sys_cnt_aicpu() : 0; Task *task = &runtime.tasks[completed_task_id]; - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(completed_task_id), static_cast(completed_task_id), task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id + "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -792,12 +792,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head, - cur_aic_ready_count, l2_perf_enabled, runtime + cur_aic_ready_count, l2_swimlane_enabled, runtime ); } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head, - cur_aiv_ready_count, l2_perf_enabled, runtime + cur_aiv_ready_count, l2_swimlane_enabled, runtime ); } @@ -829,7 +829,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const made_progress = true; // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched) - if (!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (!dispatched && l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) { @@ -851,20 +851,21 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const // Count it 
here to avoid losing completion. if (prev_running_id != AICPU_TASK_INVALID) { // Profiling: complete the implicit task's AICore record - if (l2_perf_enabled) { - uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + if (l2_swimlane_enabled) { + uint64_t finish_ts = + (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; Task *prev_task = &runtime.tasks[prev_running_id]; - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(prev_running_id), static_cast(prev_running_id), prev_task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id, + "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id, prev_running_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -894,19 +895,19 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const int completed_task_id = running_task_ids_[core_id]; - if (l2_perf_enabled) { - uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + if (l2_swimlane_enabled) { + uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? 
get_sys_cnt_aicpu() : 0; Task *task = &runtime.tasks[completed_task_id]; - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(completed_task_id), static_cast(completed_task_id), task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id + "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -921,12 +922,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head, - cur_aic_ready_count, l2_perf_enabled, runtime + cur_aic_ready_count, l2_swimlane_enabled, runtime ); } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head, - cur_aiv_ready_count, l2_perf_enabled, runtime + cur_aiv_ready_count, l2_swimlane_enabled, runtime ); } } @@ -940,7 +941,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const made_progress = true; // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched) - if (!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (!dispatched && l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -953,14 +954,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) { if (try_dispatch_task( core_id, 
reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head, - cur_aic_ready_count, l2_perf_enabled, runtime + cur_aic_ready_count, l2_swimlane_enabled, runtime )) { made_progress = true; } } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) { if (try_dispatch_task( core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head, - cur_aiv_ready_count, l2_perf_enabled, runtime + cur_aiv_ready_count, l2_swimlane_enabled, runtime )) { made_progress = true; } @@ -1099,7 +1100,7 @@ int AicpuExecutor::run(Runtime *runtime) { // Flush performance buffers for cores managed by this thread if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_flush_buffers(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]); + l2_swimlane_aicpu_flush(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]); } #if PTO2_PROFILING if (is_pmu_enabled()) { diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 346c3b9fd..44473ee0d 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -38,7 +38,7 @@ #include #include "common/core_type.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "pto_runtime2_types.h" #include "tensor_info.h" diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 94e18b35e..488cb1785 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -11,9 +11,9 @@ #include "aicore/aicore.h" #include "aicore/aicore_profiling_state.h" -#include "aicore/l2_perf_collector_aicore.h" +#include "aicore/l2_swimlane_collector_aicore.h" #include "aicore/pmu_collector_aicore.h" -#include "common/l2_perf_profiling.h" 
+#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" // Register-based communication #include "pto2_dispatch_payload.h" #include "runtime.h" @@ -57,8 +57,8 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2Di * args pointer from it on each dispatch. reg_val is a monotonically * increasing task ID used only for dispatch signaling and ACK/FIN protocol. * - * Profiling state (enable flag, L2 perf ring) is published into the platform - * via set_aicore_profiling_flag / set_aicore_l2_perf_ring at kernel entry — + * Profiling state (enable flag, L2 swimlane rotation channel) is published into the platform + * via set_aicore_profiling_flag / set_aicore_l2_swimlane_ring at kernel entry — * this routine reads it through the matching getters, so neither Handshake * nor this signature carry profiling fields. * @@ -98,19 +98,19 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); uint32_t enable_profiling_flag = get_aicore_profiling_flag(); - bool l2_perf_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE); + bool l2_swimlane_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE); bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR); bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); - // Per-core AicoreRotation channel. The pointer to THIS core's rotation - // is stored in `KernelArgs::aicore_ring_addr[block_idx]`, but AICPU - // populates that table inside `l2_perf_aicpu_init` which runs + // Per-core L2SwimlaneAicoreRotation channel. 
The pointer to THIS core's rotation + // is stored in `KernelArgs::l2_swimlane_aicore_rotation_table[block_idx]`, but AICPU + // populates that table inside `l2_swimlane_aicpu_init` which runs // concurrently with this kernel's entry — so we cannot deref at startup. - // Defer the deref via `get_aicore_rotation()` until the first task is + // Defer the deref via `get_l2_swimlane_aicore_rotation()` until the first task is // dispatched; by then AICPU's init has completed (the very dispatch is // proof of that). - __gm__ AicoreRotation *l2_perf_rotation = nullptr; - AicoreLocalState l2_perf_local = {nullptr, 0, 0}; + __gm__ L2SwimlaneAicoreRotation *l2_swimlane_rotation = nullptr; + L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, 0, 0}; // Phase 4: Main execution loop - poll register for tasks until exit signal // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit @@ -135,10 +135,10 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in uint32_t task_id = reg_val; // Decode: register holds task_id directly // First-task lazy resolve of the rotation channel — see comment - // above. `get_aicore_rotation()` caches after first call so this + // above. `get_l2_swimlane_aicore_rotation()` caches after first call so this // costs nothing on subsequent tasks. 
- if (l2_perf_enabled && l2_perf_rotation == nullptr) { - l2_perf_rotation = get_aicore_rotation(); + if (l2_swimlane_enabled && l2_swimlane_rotation == nullptr) { + l2_swimlane_rotation = get_l2_swimlane_aicore_rotation(); } // Select dual-buffer slot: same bit as AICPU used when writing payload @@ -169,9 +169,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in } // Performance profiling: record task execution - if (l2_perf_enabled) { + if (l2_swimlane_enabled) { uint64_t end_time = get_sys_cnt_aicore(); - l2_perf_aicore_record_task(l2_perf_rotation, &l2_perf_local, task_id, start_time, end_time); + l2_swimlane_aicore_record_task(l2_swimlane_rotation, &l2_swimlane_local, task_id, start_time, end_time); } last_reg_val = reg_val; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 7a4966361..a7464b25e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -35,11 +35,11 @@ #include "pto_shared_memory.h" // Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/scope_stats_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" #include "aicpu/dep_gen_collector_aicpu.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/unified_log.h" // Register-based communication @@ -521,7 +521,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // device address nor know the SchedulerContext's core fan-out). 
runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); #if PTO2_PROFILING - rt->orchestrator.l2_perf_level = get_l2_perf_level(); + rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level(); { auto &orch = rt->orchestrator; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { @@ -547,8 +547,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { sched_ctx_.wait_pto2_init_complete(); #if PTO2_PROFILING - if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { - l2_perf_aicpu_set_orch_thread_idx(thread_idx); + if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { + l2_swimlane_aicpu_set_orch_thread_idx(thread_idx); } #endif @@ -663,7 +663,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line // below carries the same envelope info for debugging, and // host-side swimlane derives per-phase timing from the per-event - // AicpuPhaseRecord[] stream that already covers everything inside + // L2SwimlaneAicpuPhaseRecord[] stream that already covers everything inside // submit_task(). int32_t total_tasks = 0; if (rt->orchestrator.sm_header) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 863299dbc..e670688a6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -166,8 +166,8 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX ``` Per-thread fanout / fanin edge counts and ready-queue pop hit / miss -stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records.json` -captured at l2_perf_level >= 3) and `deps.json`; consume them via +stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json` +captured at l2_swimlane_level >= 3) and `deps.json`; consume them via `simpler_setup/tools/sched_overhead_analysis.py`. 
--- @@ -241,10 +241,10 @@ mirrors the PMU pattern — two independent channels (one binary, one int): (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read by AICore (which only needs on/off to decide whether to write timing) and by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`. -- **Granular level (0–4)** — `L2PerfDataHeader::l2_perf_level` - (shared memory). Host writes it in `L2PerfCollector::initialize`; AICPU - promotes it from the header in `l2_perf_aicpu_init` and exposes it via - `get_l2_perf_level()` (typed `L2PerfLevel`) for +- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level` + (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU + promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via + `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates. On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled` @@ -266,7 +266,7 @@ Bare `--enable-l2-swimlane` = level 4 (backward compatible). ### Level gating in AICPU code -Use the strongly-typed `L2PerfLevel` enum so each gate names the +Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the content it depends on instead of relying on magic numbers: ```cpp @@ -275,19 +275,19 @@ content it depends on instead of relying on magic numbers: if (is_l2_swimlane_enabled()) { ... } // AICPU dispatch/finish timestamps. -// Granular checks below require l2_perf_aicpu_init to have already run +// Granular checks below require l2_swimlane_aicpu_init to have already run // (so the level has been promoted from the shared-memory header). -if (get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) { ... } +if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... } // Scheduler main-loop phase records (SCHED_*) -if (get_l2_perf_level() >= L2PerfLevel::SCHED_PHASES) { ... } +if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... 
} // Orchestrator phase records -if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { ... } +if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... } ``` -`L2PerfLevel` is defined in `common/l2_perf_profiling.h` with -underlying type `uint32_t` (matches the `L2PerfDataHeader::l2_perf_level` +`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with +underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level` shared-memory field and mirrors `PmuEventType : uint32_t`): | Enumerator | Underlying value | diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h index daef4dfdd..2ea3d5768 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h @@ -25,7 +25,7 @@ * extra I/O and an extra file in the output directory. * * deps.json is the sole source of truth for fanout: the L2 swimlane hot - * path no longer records ``L2PerfRecord::fanout[]`` (taking the per-task + * path no longer records ``L2SwimlaneAicpuTaskRecord::fanout[]`` (taking the per-task * 1 KB GM store off the scheduler critical path). Replay sees every * submit and reconstructs the complete dependency graph. * diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 6ac98b8db..70a1cacde 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -57,7 +57,7 @@ static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot // link these no-op stubs so the runtime translation unit is self-contained. 
// Visibility is hidden so the HOST .so doesn't export them into the global // dynamic symbol table where they'd shadow the AICPU .so's strong symbols -// (same pattern as get_sys_cnt_aicpu / l2_perf_aicpu_record_orch_phase below). +// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below). extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; } __attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit(uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *) {} @@ -73,7 +73,7 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl // ============================================================================= #if PTO2_ORCH_PROFILING #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" // Weak fallback for builds that don't link device_time.cpp (e.g. host). // The strong symbol from platform/.../device_time.cpp wins in the AICPU build. // @@ -86,11 +86,11 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl // so the AICPU .so's PLT resolves to its own strong definition from // device_time.cpp. __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } -// Weak fallback for builds that don't link l2_perf_collector_aicpu.cpp. +// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp. // The strong symbol from the AICPU build wins when profiling is available. // Also hidden to prevent HOST .so from polluting the global symbol table. 
__attribute__((weak, visibility("hidden"))) void -l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} +l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // Accumulated cycles per sub-step (only needed for ORCH_PROFILING export) static uint64_t g_orch_sync_cycle = 0; // tensormap sync static uint64_t g_orch_alloc_cycle = 0; // unified task+heap alloc @@ -116,9 +116,9 @@ uint64_t g_orch_scope_end_atomic_count = 0; // in favour of the cumulatives + per-submit envelope; the dispatcher // already inserts one record at the end of each submit path via // CYCLE_COUNT_ORCH_SUBMIT_RECORD. -#define CYCLE_COUNT_START() \ - bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \ - uint64_t _t0 = get_sys_cnt_aicpu(), _t1; \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ + uint64_t _t0 = get_sys_cnt_aicpu(), _t1; \ uint64_t _submit_start_ts = _t0 #define CYCLE_COUNT_LAP(acc) \ do { \ @@ -126,37 +126,37 @@ uint64_t g_orch_scope_end_atomic_count = 0; acc += (_t1 - _t0); \ _t0 = _t1; \ } while (0) -#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ - do { \ - if (_prof_active) { \ - l2_perf_aicpu_record_orch_phase( \ - AicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \ - ); \ - } \ +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ + do { \ + if (_prof_active) { \ + l2_swimlane_aicpu_record_orch_phase( \ + L2SwimlaneAicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \ + ); \ + } \ } while (0) #elif PTO2_PROFILING #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } __attribute__((weak, visibility("hidden"))) void -l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} 
+l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) static uint32_t g_orch_submit_idx = 0; -#define CYCLE_COUNT_START() \ - bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \ - uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0; \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ + uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0; \ uint64_t _submit_start_ts = _t0 #define CYCLE_COUNT_LAP(acc) \ do { \ } while (0) -#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ - do { \ - if (_prof_active) { \ - _t1 = get_sys_cnt_aicpu(); \ - l2_perf_aicpu_record_orch_phase( \ - AicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \ - ); \ - } \ +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ + do { \ + if (_prof_active) { \ + _t1 = get_sys_cnt_aicpu(); \ + l2_swimlane_aicpu_record_orch_phase( \ + L2SwimlaneAicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \ + ); \ + } \ } while (0) #else #define CYCLE_COUNT_START() diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 7dd47b19a..ff905c16d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -28,7 +28,7 @@ #ifndef PTO_ORCHESTRATOR_H #define PTO_ORCHESTRATOR_H -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "device_arena.h" #include "pto_ring_buffer.h" #include "pto_runtime2_types.h" @@ -93,8 +93,8 @@ struct PTO2OrchestratorState { int32_t total_cluster_count{0}; // AIC cores = MIX clusters int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING - // 
L2 perf_level copied from get_l2_perf_level(). - L2PerfLevel l2_perf_level{L2PerfLevel::DISABLED}; + // L2 swimlane_level copied from get_l2_swimlane_level(). + L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED}; #endif // === GM HEAP (for output buffers) === diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index a829fecd0..c6bbd0395 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -37,7 +37,7 @@ #include #include "common/core_type.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "pto2_dispatch_payload.h" #include "task_args.h" diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index f072d012c..88c8a749b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -15,11 +15,11 @@ #include "common/unified_log.h" #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/platform_regs.h" #include "aicpu/pmu_collector_aicpu.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "pto_runtime2.h" #include "pto_shared_memory.h" @@ -377,30 +377,32 @@ int32_t SchedulerContext::handle_timeout_exit( } #if PTO2_PROFILING -void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed) { - auto &l2_perf = sched_l2_perf_[thread_idx]; +void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) { + auto 
&l2_swimlane = sched_l2_swimlane_[thread_idx]; uint64_t sched_end_ts = get_sys_cnt_aicpu(); LOG_INFO_V9( "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx, - static_cast(l2_perf.sched_start_ts), static_cast(sched_end_ts), - cycles_to_us(sched_end_ts - l2_perf.sched_start_ts) + static_cast(l2_swimlane.sched_start_ts), static_cast(sched_end_ts), + cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts) ); - uint64_t sched_total = l2_perf.sched_wiring_cycle + l2_perf.sched_complete_cycle + l2_perf.sched_dispatch_cycle + - l2_perf.sched_idle_cycle; + uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle + + l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle; if (sched_total == 0) sched_total = 1; #if PTO2_SCHED_PROFILING { PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx); uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; - uint64_t complete_poll = (l2_perf.sched_complete_cycle > otc_total + l2_perf.sched_complete_perf_cycle) ? - (l2_perf.sched_complete_cycle - otc_total - l2_perf.sched_complete_perf_cycle) : - 0; - uint64_t dispatch_poll = - (l2_perf.sched_dispatch_cycle > l2_perf.sched_dispatch_pop_cycle + l2_perf.sched_dispatch_setup_cycle) ? - (l2_perf.sched_dispatch_cycle - l2_perf.sched_dispatch_pop_cycle - l2_perf.sched_dispatch_setup_cycle) : + uint64_t complete_poll = + (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ? + (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) : 0; + uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle > + l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ? 
+ (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle - + l2_swimlane.sched_dispatch_setup_cycle) : + 0; LOG_INFO_V9( "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx, @@ -411,20 +413,21 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges // × core_to_thread). LOG_INFO_V9( - "Thread %d: complete : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_complete_cycle), - l2_perf.sched_complete_cycle * 100.0 / sched_total + "Thread %d: complete : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle), + l2_swimlane.sched_complete_cycle * 100.0 / sched_total ); - uint64_t c_parent = l2_perf.sched_complete_cycle > 0 ? l2_perf.sched_complete_cycle : 1; - uint64_t complete_miss_count = (l2_perf.complete_probe_count > l2_perf.complete_hit_count) ? - (l2_perf.complete_probe_count - l2_perf.complete_hit_count) : + uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1; + uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ? + (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) : 0; - double complete_hit_rate = - l2_perf.complete_probe_count > 0 ? l2_perf.complete_hit_count * 100.0 / l2_perf.complete_probe_count : 0.0; + double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ? 
+ l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count : + 0.0; LOG_INFO_V9( "Thread %d: poll : %.3fus (%.1f%%) hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%", thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent, - static_cast(l2_perf.complete_hit_count), static_cast(complete_miss_count), + static_cast(l2_swimlane.complete_hit_count), static_cast(complete_miss_count), complete_hit_rate ); LOG_INFO_V9( @@ -451,7 +454,8 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa ); LOG_INFO_V9( "Thread %d: perf : %.3fus (%.1f%%)", thread_idx, - cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent + cycles_to_us(l2_swimlane.sched_complete_perf_cycle), + l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent ); // pop_hit / pop_miss per-emit deltas live in each dispatch-phase @@ -459,65 +463,67 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa // the run-cumulative tracked in this struct (final-drain emit covers // the trailing-idle tail). LOG_INFO_V9( - "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle), - l2_perf.sched_dispatch_cycle * 100.0 / sched_total + "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle), + l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total ); - uint64_t global_dispatch_count = l2_perf.pop_hit - l2_perf.local_dispatch_count; - uint64_t total_dispatched = l2_perf.local_dispatch_count + global_dispatch_count; - double local_hit_rate = total_dispatched > 0 ? l2_perf.local_dispatch_count * 100.0 / total_dispatched : 0.0; + uint64_t global_dispatch_count = l2_swimlane.pop_hit - l2_swimlane.local_dispatch_count; + uint64_t total_dispatched = l2_swimlane.local_dispatch_count + global_dispatch_count; + double local_hit_rate = + total_dispatched > 0 ? 
l2_swimlane.local_dispatch_count * 100.0 / total_dispatched : 0.0; LOG_INFO_V9( "Thread %d: local_disp : local=%" PRIu64 ", global=%" PRIu64 ", overflow=%" PRIu64 ", local_rate=%.1f%%", - thread_idx, static_cast(l2_perf.local_dispatch_count), - static_cast(global_dispatch_count), static_cast(l2_perf.local_overflow_count), + thread_idx, static_cast(l2_swimlane.local_dispatch_count), + static_cast(global_dispatch_count), static_cast(l2_swimlane.local_overflow_count), local_hit_rate ); - uint64_t d_parent = l2_perf.sched_dispatch_cycle > 0 ? l2_perf.sched_dispatch_cycle : 1; + uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1; LOG_INFO_V9( "Thread %d: poll : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll), dispatch_poll * 100.0 / d_parent ); LOG_INFO_V9( "Thread %d: pop : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, - cycles_to_us(l2_perf.sched_dispatch_pop_cycle), l2_perf.sched_dispatch_pop_cycle * 100.0 / d_parent, - cycles_to_us(l2_perf.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent, + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), static_cast(sp.pop_atomic_count) ); LOG_INFO_V9( "Thread %d: setup : %.3fus (%.1f%%)", thread_idx, - cycles_to_us(l2_perf.sched_dispatch_setup_cycle), l2_perf.sched_dispatch_setup_cycle * 100.0 / d_parent + cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle), + l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent ); #if PTO2_SCHED_PROFILING LOG_INFO_V9( "Thread %d: wiring : %.3fus (%.1f%%) tasks=%d", thread_idx, - cycles_to_us(l2_perf.sched_wiring_cycle), l2_perf.sched_wiring_cycle * 100.0 / sched_total, - l2_perf.phase_wiring_count + cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total, + 
l2_swimlane.phase_wiring_count ); #else LOG_INFO_V9( - "Thread %d: wiring : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_wiring_cycle), - l2_perf.sched_wiring_cycle * 100.0 / sched_total + "Thread %d: wiring : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle), + l2_swimlane.sched_wiring_cycle * 100.0 / sched_total ); #endif LOG_INFO_V9( - "Thread %d: idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_idle_cycle), - l2_perf.sched_idle_cycle * 100.0 / sched_total + "Thread %d: idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle), + l2_swimlane.sched_idle_cycle * 100.0 / sched_total ); if (cur_thread_completed > 0) { LOG_INFO_V9( "Thread %d: avg/complete : %.3fus", thread_idx, - cycles_to_us(l2_perf.sched_complete_cycle) / cur_thread_completed + cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed ); } } #endif LOG_INFO_V9( "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, - cycles_to_us(sched_total), static_cast(l2_perf.sched_loop_count), cur_thread_completed + cycles_to_us(sched_total), static_cast(l2_swimlane.sched_loop_count), cur_thread_completed ); } #endif @@ -832,15 +838,15 @@ int32_t SchedulerContext::init( regs_ = regs_base; #if PTO2_PROFILING - // l2_perf_aicpu_init promotes g_l2_perf_level from the shared-memory + // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory // header — must be called BEFORE caching the level, otherwise the cached // value would still be 0 (only the binary enable bit has been seeded by // kernel.cpp at this point). 
if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init(runtime->worker_count); - l2_perf_level_ = get_l2_perf_level(); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - l2_perf_aicpu_init_phase(runtime->worker_count, sched_thread_num_); + l2_swimlane_aicpu_init(runtime->worker_count); + l2_swimlane_level_ = get_l2_swimlane_level(); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_thread_num_); } } #endif @@ -965,9 +971,9 @@ void SchedulerContext::on_orchestration_done( Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks ) { #if PTO2_PROFILING - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { // Flush orchestrator's phase record buffer - l2_perf_aicpu_flush_phase_buffers(thread_idx); + l2_swimlane_aicpu_flush_phase_buffers(thread_idx); } #endif @@ -1020,10 +1026,10 @@ void SchedulerContext::on_orchestration_done( // Write core-to-thread mapping AFTER reassignment so the profiling data // reflects the final distribution (all active_sched_threads_, including // former orchestrator threads when orch_to_sched_ is enabled). 
- if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - l2_perf_aicpu_init_core_assignments(cores_total_num_); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_init_core_assignments(cores_total_num_); for (int32_t t = 0; t < active_sched_threads_; t++) { - l2_perf_aicpu_write_core_assignments_for_thread( + l2_swimlane_aicpu_write_core_assignments_for_thread( t, core_trackers_[t].core_ids(), core_trackers_[t].core_num() ); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp index ea6ab5e01..b18091841 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp @@ -13,7 +13,7 @@ #include "common/unified_log.h" #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "pto_runtime2.h" @@ -21,7 +21,7 @@ #include "spin_hint.h" // Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" @@ -77,7 +77,7 @@ void SchedulerContext::complete_slot_task( #endif ) { #if PTO2_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #else (void)hank; #endif @@ -130,7 +130,7 @@ void SchedulerContext::complete_slot_task( sched_->on_mixed_task_complete(slot_state, local_bufs); #endif #if PTO2_PROFILING - l2_perf.phase_complete_count++; + l2_swimlane.phase_complete_count++; #endif if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) { deferred_release_slot_states[deferred_release_count++] = &slot_state; @@ -151,24 +151,24 @@ void 
SchedulerContext::complete_slot_task( } #if PTO2_PROFILING - if (l2_perf.l2_perf_enabled) { + if (l2_swimlane.l2_swimlane_enabled) { #if PTO2_SCHED_PROFILING uint64_t t_perf_start = get_sys_cnt_aicpu(); #endif - uint64_t finish_ts = (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + uint64_t finish_ts = (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; int32_t perf_slot_idx = static_cast(subslot); - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, slot_state.task->kernel_id[perf_slot_idx], hank[core_id].core_type, dispatch_ts, finish_ts ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, + "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id, static_cast(slot_state.task->task_id.raw) ); } #if PTO2_SCHED_PROFILING - l2_perf.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start); + l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start); #endif } @@ -205,7 +205,7 @@ void SchedulerContext::check_running_cores_for_completion( PTO2LocalReadyBuffer *local_bufs ) { #if PTO2_SCHED_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #endif CoreTracker &tracker = core_trackers_[thread_idx]; auto running_core_states = tracker.get_all_running_cores(); @@ -227,8 +227,8 @@ void SchedulerContext::check_running_cores_for_completion( int32_t reg_state = EXTRACT_TASK_STATE(reg_val); #if PTO2_SCHED_PROFILING - if (l2_perf.l2_perf_enabled) { - l2_perf.complete_probe_count++; + if (l2_swimlane.l2_swimlane_enabled) { + l2_swimlane.complete_probe_count++; } #endif @@ -237,8 +237,8 @@ void SchedulerContext::check_running_cores_for_completion( if (!t.matched) continue; #if PTO2_SCHED_PROFILING - if (l2_perf.l2_perf_enabled && (t.running_done || t.pending_done)) { 
- l2_perf.complete_hit_count++; + if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) { + l2_swimlane.complete_hit_count++; } #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index 7ed8c6bb0..7886fb6bc 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -11,7 +11,7 @@ #ifndef SCHEDULER_CONTEXT_H #define SCHEDULER_CONTEXT_H -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/unified_log.h" #include "scheduler_types.h" @@ -135,10 +135,10 @@ class SchedulerContext { SyncStartDrainState drain_state_; #if PTO2_PROFILING - SchedL2PerfCounters sched_l2_perf_[MAX_AICPU_THREADS]; - // Cached once at init() from get_l2_perf_level(), AFTER - // l2_perf_aicpu_init has promoted the level from the shared-memory header. - L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; + SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS]; + // Cached once at init() from get_l2_swimlane_level(), AFTER + // l2_swimlane_aicpu_init has promoted the level from the shared-memory header. 
+ L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; #endif // --- Task-execution tracking --- @@ -349,7 +349,7 @@ class SchedulerContext { ); #if PTO2_PROFILING - __attribute__((noinline, cold)) void log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed); + __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed); #endif // ========================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 0239ea218..3d5d95540 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -19,7 +19,7 @@ #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" #include "callable.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "pto_runtime2.h" @@ -27,7 +27,7 @@ #include "spin_hint.h" // Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" @@ -79,15 +79,15 @@ int SchedulerContext::pop_ready_tasks_batch( PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count ) { #if PTO2_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #if PTO2_SCHED_PROFILING extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; uint64_t t_pop_start = get_sys_cnt_aicpu(); int count = sched_->get_ready_tasks_batch( shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx], - 
l2_perf.local_dispatch_count + l2_swimlane.local_dispatch_count ); - l2_perf.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); + l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); #else int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count); #endif @@ -95,9 +95,9 @@ int SchedulerContext::pop_ready_tasks_batch( // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health // stats on default builds. if (count > 0) { - l2_perf.pop_hit += count; + l2_swimlane.pop_hit += count; } else { - l2_perf.pop_miss++; + l2_swimlane.pop_miss++; } #else (void)thread_idx; @@ -159,7 +159,7 @@ void SchedulerContext::dispatch_subtask_to_core( core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -168,7 +168,7 @@ void SchedulerContext::dispatch_subtask_to_core( core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -246,7 +246,7 @@ void SchedulerContext::dispatch_block( dispatch_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx); } #if PTO2_PROFILING - sched_l2_perf_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask()); + sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask()); #endif } @@ -255,7 +255,7 @@ void SchedulerContext::dispatch_shape( CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed ) { #if 
PTO2_SCHED_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #endif if (entered_drain) return; @@ -323,7 +323,7 @@ void SchedulerContext::dispatch_shape( } made_progress = true; #if PTO2_SCHED_PROFILING - l2_perf.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); + l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); #endif } @@ -352,7 +352,7 @@ void SchedulerContext::dispatch_ready_tasks( const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1]; #if PTO2_SCHED_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #endif // Note: flush_local_bufs is invoked multiple times per pass (mid-function @@ -366,7 +366,7 @@ void SchedulerContext::dispatch_ready_tasks( for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) { auto &lb = local_bufs[s]; #if PTO2_SCHED_PROFILING - l2_perf.local_overflow_count += lb.count; + l2_swimlane.local_overflow_count += lb.count; #endif if (lb.count > 0) { sched_->ready_queues[s].push_batch(lb.slot_states, lb.count); @@ -511,9 +511,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ int32_t idle_iterations = 0; int32_t last_progress_count = 0; #if PTO2_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; - l2_perf.reset(); - l2_perf.l2_perf_enabled = (l2_perf_level_ != L2PerfLevel::DISABLED); + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; + l2_swimlane.reset(); + l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED); #endif constexpr int LOCAL_READY_CAP_PER_TYPE = 64; @@ -534,7 +534,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ const bool pmu_active = is_pmu_enabled(); #if PTO2_PROFILING - l2_perf.sched_start_ts = get_sys_cnt_aicpu(); + l2_swimlane.sched_start_ts = get_sys_cnt_aicpu(); #endif while (true) { @@ -544,7 +544,7 @@ int32_t 
SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ bool made_progress = false; #if PTO2_PROFILING CYCLE_COUNT_START(); - l2_perf.sched_loop_count++; + l2_swimlane.sched_loop_count++; uint64_t _t0_phase = _t0; #endif int32_t task_count = 0; @@ -559,7 +559,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ } #if PTO2_PROFILING - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); #endif // Phase 1: Check running cores for completion @@ -621,16 +621,16 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #if PTO2_PROFILING if (!try_completed) { - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); } else { - CYCLE_COUNT_LAP(l2_perf.sched_complete_cycle); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_complete_count > 0) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_perf.sched_loop_count, - l2_perf.phase_complete_count + CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_complete_count > 0) { + l2_swimlane_aicpu_record_phase( + thread_idx, L2SwimlaneAicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_complete_count ); _t0_phase = _t1; - l2_perf.phase_complete_count = 0; + l2_swimlane.phase_complete_count = 0; } } #endif @@ -649,12 +649,12 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ if (wired > 0) { made_progress = true; #if PTO2_SCHED_PROFILING - l2_perf.phase_wiring_count += wired; + l2_swimlane.phase_wiring_count += wired; #endif } } #if PTO2_PROFILING - CYCLE_COUNT_LAP(l2_perf.sched_wiring_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle); #endif // Phase 3b: Drain dummy ready queue (thread 0 only). 
@@ -704,28 +704,28 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #if PTO2_PROFILING if (!try_pushed) { - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); } else { - CYCLE_COUNT_LAP(l2_perf.sched_dispatch_cycle); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_dispatch_count > 0) { + CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) { // Per-emit pop deltas via snapshot diff; the cumulative // pop_hit / pop_miss stay intact for the cold-path log. - uint64_t pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit; - uint64_t pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit; - // AicpuPhaseRecord's extras are uint32 — a delta that overflows means + uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; + // L2SwimlaneAicpuPhaseRecord's extras are uint32 — a delta that overflows means // an emit was missed for ~4 billion pops, which is well outside any // realistic dispatch cadence and silently truncates without this guard. 
debug_assert(pop_hit_delta < (1ULL << 32)); debug_assert(pop_miss_delta < (1ULL << 32)); - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_perf.sched_loop_count, - l2_perf.phase_dispatch_count, static_cast(pop_hit_delta), + l2_swimlane_aicpu_record_phase( + thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_dispatch_count, static_cast(pop_hit_delta), static_cast(pop_miss_delta) ); _t0_phase = _t1; - l2_perf.phase_dispatch_count = 0; - l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit; - l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss; + l2_swimlane.phase_dispatch_count = 0; + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; } } #endif @@ -760,21 +760,21 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ thread_idx, header, runtime, idle_iterations, last_progress_count #if PTO2_PROFILING , - l2_perf.sched_start_ts + l2_swimlane.sched_start_ts #endif ); } else { SPIN_WAIT_HINT(); } #if PTO2_PROFILING - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); // Idle iterations no longer emit a phase record. Host tooling // recovers idle spans from the gap between consecutive sched // phase records on the same thread. _t0_phase still advances // so the next emitted COMPLETE/DISPATCH gets the correct // start_time (the iter it actually ran in), not the start of // the preceding idle stretch. - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { _t0_phase = _t1; } #endif @@ -801,31 +801,31 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // sum(record.pop_*) reconciles with the run-cumulative counter. // Gate on SCHED_PHASES — at lower levels the phase buffer is never // flushed (see below), so writing this record would be wasted work. 
- if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - uint64_t final_pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit; - uint64_t final_pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit; + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; debug_assert(final_pop_hit_delta < (1ULL << 32)); debug_assert(final_pop_miss_delta < (1ULL << 32)); if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) { uint64_t t_now = get_sys_cnt_aicpu(); - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_perf.sched_loop_count, 0, + l2_swimlane_aicpu_record_phase( + thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_swimlane.sched_loop_count, 0, static_cast(final_pop_hit_delta), static_cast(final_pop_miss_delta) ); - l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit; - l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss; + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; } } - log_l2_perf_summary(thread_idx, cur_thread_completed); + log_l2_swimlane_summary(thread_idx, cur_thread_completed); #endif #if PTO2_PROFILING - if (l2_perf.l2_perf_enabled) { - l2_perf_aicpu_flush_buffers( + if (l2_swimlane.l2_swimlane_enabled) { + l2_swimlane_aicpu_flush( thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num() ); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - l2_perf_aicpu_flush_phase_buffers(thread_idx); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_flush_phase_buffers(thread_idx); } } #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h index a73e1b0b9..00ceef76d 
100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h @@ -341,8 +341,8 @@ struct SlotTransition { // ============================================================================= #if PTO2_PROFILING -struct alignas(64) SchedL2PerfCounters { - bool l2_perf_enabled{false}; +struct alignas(64) SchedL2SwimlaneCounters { + bool l2_swimlane_enabled{false}; uint64_t sched_start_ts{0}; uint64_t sched_complete_cycle{0}; uint64_t sched_dispatch_cycle{0}; @@ -369,7 +369,7 @@ struct alignas(64) SchedL2PerfCounters { uint64_t sched_dispatch_pop_cycle{0}; uint64_t sched_dispatch_setup_cycle{0}; #endif - void reset() { *this = SchedL2PerfCounters{}; } + void reset() { *this = SchedL2SwimlaneCounters{}; } }; #endif diff --git a/src/a5/platform/include/aicore/aicore_profiling_state.h b/src/a5/platform/include/aicore/aicore_profiling_state.h index 53acce990..70c062a88 100644 --- a/src/a5/platform/include/aicore/aicore_profiling_state.h +++ b/src/a5/platform/include/aicore/aicore_profiling_state.h @@ -25,7 +25,7 @@ * * Lifecycle: * 1. Host fills `KernelArgs::enable_profiling_flag`, the two per-core - * ring address arrays (`aicore_l2_perf_ring_addrs`, + * ring address arrays (`aicore_l2_swimlane_ring_addrs`, * `aicore_pmu_ring_addrs`), and `regs` (the per-physical-core * register-base array — already required for AICPU). * 2. AICore kernel entry indexes the ring arrays by `block_idx` and @@ -44,7 +44,7 @@ #include #include "aicore/aicore.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/pmu_profiling.h" /** @@ -56,12 +56,12 @@ __aicore__ void set_aicore_profiling_flag(uint32_t flag); __aicore__ uint32_t get_aicore_profiling_flag(); /** - * Per-core L2Perf staging ring. Set once at kernel entry from - * `((__gm__ uint64_t*)k_args->aicore_l2_perf_ring_addrs)[block_idx]`; + * Per-core L2Swimlane staging ring. 
Set once at kernel entry from + * `((__gm__ uint64_t*)k_args->aicore_l2_swimlane_ring_addrs)[block_idx]`; * nullptr when the L2 swimlane bit is off or the address table is null. */ -__aicore__ void set_aicore_l2_perf_ring(__gm__ L2PerfAicoreRing *ring); -__aicore__ __gm__ L2PerfAicoreRing *get_aicore_l2_perf_ring(); +__aicore__ void set_aicore_l2_swimlane_ring(__gm__ L2SwimlaneAicoreRing *ring); +__aicore__ __gm__ L2SwimlaneAicoreRing *get_aicore_l2_swimlane_ring(); /** * Per-core PMU staging ring (a5-only — AICore writes the snapshot). diff --git a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h b/src/a5/platform/include/aicore/l2_swimlane_collector_aicore.h similarity index 73% rename from src/a5/platform/include/aicore/l2_perf_collector_aicore.h rename to src/a5/platform/include/aicore/l2_swimlane_collector_aicore.h index b10ebb32f..2bdfceaa5 100644 --- a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h +++ b/src/a5/platform/include/aicore/l2_swimlane_collector_aicore.h @@ -9,18 +9,18 @@ * ----------------------------------------------------------------------------------------------------------- */ /** - * @file l2_perf_collector_aicore.h + * @file l2_swimlane_collector_aicore.h * @brief AICore performance data collection interface * * Provides lightweight performance recording interface for AICore kernels. * Uses dcci for efficient cache management instead of memory barriers. 
*/ -#ifndef PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_ -#define PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_ +#ifndef PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_ +#define PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_ #include "aicore/aicore.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" // Include platform-specific timestamp implementation @@ -35,29 +35,30 @@ /** * Record task execution performance data * - * Writes timing metrics to the per-core L2PerfAicoreRing slot + * Writes timing metrics to the per-core L2SwimlaneAicoreRing slot * (`dual_issue_slots[task_id % PLATFORM_L2_AICORE_RING_SIZE]`). The * ring is allocated once by the host and never reassigned, so AICore writes * to a stable address regardless of AICPU buffer rotations. AICPU reads the - * slot in `l2_perf_aicpu_complete_record` and commits the record into the - * rotating L2PerfBuffer. + * slot in `l2_swimlane_aicpu_complete_task` and commits the record into the + * rotating L2SwimlaneAicpuTaskBuffer. * - * AICore writes L2PerfRecord.task_id as the register dispatch token (low 32 bits, zero-extended). + * AICore writes L2SwimlaneAicpuTaskRecord.task_id as the register dispatch token (low 32 bits, zero-extended). * For tensormap_and_ringbuffer, AICPU overwrites with the full (ring_id << 32) | local_id * encoding after handshake match. 
* - * @param ring Per-core L2PerfAicoreRing pointer (from get_aicore_l2_perf_ring()) + * @param ring Per-core L2SwimlaneAicoreRing pointer (from get_aicore_l2_swimlane_ring()) * @param task_id Register dispatch id (DATA_MAIN_BASE), stored in task_id low 32 bits * @param start_time Start timestamp * @param end_time End timestamp */ -__aicore__ __attribute__((always_inline)) static inline void -l2_perf_aicore_record_task(__gm__ L2PerfAicoreRing *ring, uint32_t task_id, uint64_t start_time, uint64_t end_time) { +__aicore__ __attribute__((always_inline)) static inline void l2_swimlane_aicore_record_task( + __gm__ L2SwimlaneAicoreRing *ring, uint32_t task_id, uint64_t start_time, uint64_t end_time +) { // Modulo-indexed slot. PLATFORM_L2_AICORE_RING_SIZE is conventionally a // power of two so the compiler reduces this to a mask, but using `%` // keeps the index correct if the ring size is ever retuned to a // non-power-of-two value (matches the a2a3 convention). - __gm__ L2PerfRecord *record = &ring->dual_issue_slots[task_id % PLATFORM_L2_AICORE_RING_SIZE]; + __gm__ L2SwimlaneAicpuTaskRecord *record = &ring->dual_issue_slots[task_id % PLATFORM_L2_AICORE_RING_SIZE]; record->start_time = start_time; record->end_time = end_time; @@ -71,4 +72,4 @@ l2_perf_aicore_record_task(__gm__ L2PerfAicoreRing *ring, uint32_t task_id, uint dsb((mem_dsb_t)0); } -#endif // PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_ +#endif // PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_ diff --git a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h similarity index 67% rename from src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h rename to src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h index fac7a691c..e3680fd17 100644 --- a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h @@ -9,18 +9,18 @@ * 
----------------------------------------------------------------------------------------------------------- */ /** - * @file l2_perf_collector_aicpu.h + * @file l2_swimlane_collector_aicpu.h * @brief AICPU performance data collection interface * * Provides performance profiling management interface for AICPU side. * Handles buffer initialization, switching, and flushing. */ -#ifndef PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_ -#define PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_ +#ifndef PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_ +#define PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_ #include "common/core_type.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" // Include platform-specific timestamp implementation // Build system selects the correct inner_aicpu.h based on platform: @@ -30,51 +30,51 @@ // ============= Public Interface ============= /** - * L2 perf platform setters — called by the host (sim) or the AICPU kernel - * entry (onboard) before `l2_perf_aicpu_init()` so AICPU code can read perf + * L2 swimlane platform setters — called by the host (sim) or the AICPU kernel + * entry (onboard) before `l2_swimlane_aicpu_init()` so AICPU code can read perf * state without reaching into the generic `Runtime` struct. * * Two-channel level transport (mirrors the PMU pattern): * - binary on/off — `enable_profiling_flag` bit1 → `set_l2_swimlane_enabled(bool)` * at kernel entry; queried via `is_l2_swimlane_enabled()`. - * - granular L2PerfLevel — `L2PerfDataHeader::l2_perf_level` (shared memory); - * read in `l2_perf_aicpu_init` and cached, then queried via - * `get_l2_perf_level()` for `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates. + * - granular L2SwimlaneLevel — `L2SwimlaneDataHeader::l2_swimlane_level` (shared memory); + * read in `l2_swimlane_aicpu_init` and cached, then queried via + * `get_l2_swimlane_level()` for `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates. 
*/ -extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base); -extern "C" uint64_t get_platform_l2_perf_base(); +extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base); +extern "C" uint64_t get_platform_l2_swimlane_base(); extern "C" void set_l2_swimlane_enabled(bool enable); extern "C" bool is_l2_swimlane_enabled(); // Typed getter for the granular perf_level (promoted from the shared-memory -// header inside l2_perf_aicpu_init). Gate sites should use this so the -// comparison RHS is a named L2PerfLevel constant. -L2PerfLevel get_l2_perf_level(); +// header inside l2_swimlane_aicpu_init). Gate sites should use this so the +// comparison RHS is a named L2SwimlaneLevel constant. +L2SwimlaneLevel get_l2_swimlane_level(); /** * Initialize performance profiling for `worker_count` cores. * * Caches per-core BufferState (including stable AICore staging-ring - * pointers `state.aicore_ring_ptr`) and pops the initial L2PerfBuffer from + * pointers `state.aicore_ring_ptr`) and pops the initial L2SwimlaneAicpuTaskBuffer from * each free_queue. Reads the perf device-base pointer published via - * `set_platform_l2_perf_base()`. Does **not** write any Handshake field — + * `set_platform_l2_swimlane_base()`. Does **not** write any Handshake field — * profiling state lives in `KernelArgs` + AICore platform-owned slots. * * @param worker_count Number of active AICore workers */ -void l2_perf_aicpu_init(int worker_count); +void l2_swimlane_aicpu_init(int worker_count); /** - * Complete a L2PerfRecord with AICPU-side metadata after AICore task completion + * Complete a L2SwimlaneAicpuTaskRecord with AICPU-side metadata after AICore task completion * - * Reads from the per-core L2PerfAicoreRing dual-issue slot + * Reads from the per-core L2SwimlaneAicoreRing dual-issue slot * (`s_perf_aicore_rings[core_id]->dual_issue_slots[reg_task_id & ...]`), * validates task_id match, and commits the record into * `state->current_buf_ptr->records[count++]`. 
Callers must pre-extract * fanout into a plain uint64_t array (platform layer cannot depend on * runtime linked-list types). * - * @param core_id Core ID owning the destination buffer (resolved via s_perf_buffer_states) + * @param core_id Core ID owning the destination buffer (resolved via s_aicpu_task_pools) * @param thread_idx Owning AICPU thread (used when rotating records buffer) * @param expected_reg_task_id Register dispatch token (low 32 bits) to validate * @param task_id Task identifier to write (PTO2 encoding or plain id) @@ -89,7 +89,7 @@ void l2_perf_aicpu_init(int worker_count); * flush()-clearing current_buf_ptr deterministically halts subsequent commits * (they take the dropped path). Same shape as a2a3. */ -int l2_perf_aicpu_complete_record( +int l2_swimlane_aicpu_complete_task( int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type, uint64_t dispatch_time, uint64_t finish_time, const uint64_t *fanout, int32_t fanout_count ); @@ -103,23 +103,23 @@ int l2_perf_aicpu_complete_record( * @param cur_thread_cores Array of core IDs managed by this thread * @param core_num Number of cores managed by this thread */ -void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num); +void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num); /** * Initialize AICPU phase profiling * - * Sets up AicpuPhaseHeader and clears per-thread phase record buffers. - * Must be called once from thread 0 after l2_perf_aicpu_init(). + * Sets up L2SwimlaneAicpuPhaseHeader and clears per-thread phase record buffers. + * Must be called once from thread 0 after l2_swimlane_aicpu_init(). 
* * @param worker_count Number of AICore workers (used to locate phase region) * @param num_sched_threads Number of scheduler threads */ -void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads); +void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads); /** * Record a single scheduler phase * - * Appends an AicpuPhaseRecord to the specified thread's buffer. + * Appends an L2SwimlaneAicpuPhaseRecord to the specified thread's buffer. * When the buffer is full, switches to a new buffer via FreeQueue. * * @param thread_idx Scheduler thread index @@ -130,12 +130,12 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads); * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or * full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator * phases in tensormap_and_ringbuffer) - * @param extra1, extra2 Phase-specific delta counters (see AicpuPhaseRecord doc). + * @param extra1, extra2 Phase-specific delta counters (see L2SwimlaneAicpuPhaseRecord doc). * SCHED_DISPATCH uses extra1=pop_hit, extra2=pop_miss; other * phases pass 0. */ -void l2_perf_aicpu_record_phase( - int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, +void l2_swimlane_aicpu_record_phase( + int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, uint64_t tasks_processed, uint32_t extra1 = 0, uint32_t extra2 = 0 ); @@ -143,16 +143,16 @@ void l2_perf_aicpu_record_phase( * Set orchestrator thread index for per-task phase recording * * Must be called once from the orchestrator thread before any - * l2_perf_aicpu_record_orch_phase() calls. + * l2_swimlane_aicpu_record_orch_phase() calls. 
* * @param thread_idx Thread index for the orchestrator (typically num_sched_threads) */ -void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); +void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx); /** * Record a single orchestrator phase * - * Appends an AicpuPhaseRecord for one sub-step of submit_task(). + * Appends an L2SwimlaneAicpuPhaseRecord for one sub-step of submit_task(). * Uses the orchestrator's dedicated buffer slot (set via set_orch_thread_idx). * * @param phase_id Orchestrator phase identifier (ORCH_SYNC..ORCH_SCOPE_END) @@ -162,28 +162,28 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding: * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes. */ -void l2_perf_aicpu_record_orch_phase( - AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id +void l2_swimlane_aicpu_record_orch_phase( + L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id ); /** * Write core-to-thread assignment mapping to shared memory. * - * Callers invoke `l2_perf_aicpu_init_core_assignments(total_cores)` once, then - * `l2_perf_aicpu_write_core_assignments_for_thread(t, ids, n)` for every + * Callers invoke `l2_swimlane_aicpu_init_core_assignments(total_cores)` once, then + * `l2_swimlane_aicpu_write_core_assignments_for_thread(t, ids, n)` for every * scheduler thread. 
*/ -void l2_perf_aicpu_init_core_assignments(int total_cores); -void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num); +void l2_swimlane_aicpu_init_core_assignments(int total_cores); +void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num); /** * Flush remaining phase records for a thread * * Marks the current WRITING phase buffer as READY and enqueues it - * for host collection. Called at thread exit (analogous to l2_perf_aicpu_flush_buffers). + * for host collection. Called at thread exit (analogous to l2_swimlane_aicpu_flush). * * @param thread_idx Thread index (scheduler thread or orchestrator) */ -void l2_perf_aicpu_flush_phase_buffers(int thread_idx); +void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx); -#endif // PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_ +#endif // PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_ diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h index 67d7a5f1e..34d9a08a8 100644 --- a/src/a5/platform/include/common/kernel_args.h +++ b/src/a5/platform/include/common/kernel_args.h @@ -69,19 +69,21 @@ struct KernelArgs { DeviceArgs *device_args{nullptr}; // Device arguments (AICPU reads, contains SO info) __may_used_by_aicore__ Runtime *runtime_args{nullptr}; // Task runtime in device memory uint64_t regs{0}; // Per-core register base address array (platform-specific) - uint64_t dump_data_base{0}; // Dump shared memory base address; use explicit flags to detect enablement - uint64_t l2_perf_data_base{0}; // L2 perf shared memory base address; use explicit flags to detect enablement - uint64_t pmu_data_base{0}; // PMU buffer base address (device memory); 0 = PMU disabled + uint64_t dump_data_base{0}; // Dump shared memory base address; use explicit flags to detect enablement + uint64_t l2_swimlane_data_base{ + 0 + }; // L2 swimlane shared memory base address; use explicit flags to 
detect enablement + uint64_t pmu_data_base{0}; // PMU buffer base address (device memory); 0 = PMU disabled // Profiling per-core address arrays (moved out of Handshake). Each *_addrs // field is a device pointer to uint64_t[num_aicore]. AICore KERNEL_ENTRY // indexes by block_idx and forwards into per-core platform state. - uint64_t aicore_l2_perf_ring_addrs{0}; // L2PerfAicoreRing* per core; 0 when L2 swimlane is off - uint64_t aicore_pmu_ring_addrs{0}; // PmuAicoreRing* per core; 0 when PMU is off - uint64_t scope_stats_data_base{0}; // ScopeStatsBuffer device pointer; 0 when scope_stats is off. - // a5 has no halHostRegister — host keeps a separate shadow and - // refreshes it via rtMemcpy DEVICE_TO_HOST at dump time. - uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL - uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 + uint64_t aicore_l2_swimlane_ring_addrs{0}; // L2SwimlaneAicoreRing* per core; 0 when L2 swimlane is off + uint64_t aicore_pmu_ring_addrs{0}; // PmuAicoreRing* per core; 0 when PMU is off + uint64_t scope_stats_data_base{0}; // ScopeStatsBuffer device pointer; 0 when scope_stats is off. + // a5 has no halHostRegister — host keeps a separate shadow and + // refreshes it via rtMemcpy DEVICE_TO_HOST at dump time. 
+ uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL + uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats uint32_t _pad{0}; // Alignment padding diff --git a/src/a5/platform/include/common/l2_perf_profiling.h b/src/a5/platform/include/common/l2_swimlane_profiling.h similarity index 69% rename from src/a5/platform/include/common/l2_perf_profiling.h rename to src/a5/platform/include/common/l2_swimlane_profiling.h index e4ede5d0f..363d60330 100644 --- a/src/a5/platform/include/common/l2_perf_profiling.h +++ b/src/a5/platform/include/common/l2_swimlane_profiling.h @@ -10,49 +10,49 @@ */ /** - * @file l2_perf_profiling.h + * @file l2_swimlane_profiling.h * @brief Performance profiling data structures * * Architecture: Fixed header + per-core/thread buffer states + optional phase profiling region * * Memory layout (shared memory between Host and Device): * ┌─────────────────────────────────────────────────────────────┐ - * │ L2PerfDataHeader (fixed header) │ + * │ L2SwimlaneDataHeader (fixed header) │ * │ - ReadyQueue (FIFO, capacity=PLATFORM_PROF_READYQUEUE_SIZE)│ * │ - Metadata (num_cores, flags) │ * ├─────────────────────────────────────────────────────────────┤ - * │ L2PerfBufferState[0] (Core 0) │ + * │ L2SwimlaneAicpuTaskPool[0] (Core 0) │ * │ - free_queue: SPSC queue of available buffer pointers │ * │ - current_buf_ptr, current_buf_seq │ * ├─────────────────────────────────────────────────────────────┤ - * │ L2PerfBufferState[1] (Core 1) │ + * │ L2SwimlaneAicpuTaskPool[1] (Core 1) │ * ├─────────────────────────────────────────────────────────────┤ * │ ... 
│ * ├─────────────────────────────────────────────────────────────┤ - * │ L2PerfBufferState[num_cores-1] │ + * │ L2SwimlaneAicpuTaskPool[num_cores-1] │ * ├─────────────────────────────────────────────────────────────┤ - * │ AicpuPhaseHeader (optional, present when phase profiling) │ + * │ L2SwimlaneAicpuPhaseHeader (optional, present when phase profiling) │ * │ - magic, num_sched_threads, records_per_thread │ * │ - core_to_thread mapping │ * ├─────────────────────────────────────────────────────────────┤ - * │ PhaseBufferState[thread0] │ + * │ L2SwimlaneAicpuPhasePool[thread0] │ * │ - free_queue: SPSC queue of available buffer pointers │ * │ - current_buf_ptr, current_buf_seq │ * ├─────────────────────────────────────────────────────────────┤ - * │ PhaseBufferState[thread1] │ + * │ L2SwimlaneAicpuPhasePool[thread1] │ * ├─────────────────────────────────────────────────────────────┤ * │ ... │ * └─────────────────────────────────────────────────────────────┘ * - * Actual L2PerfBuffer / PhaseBuffer are allocated dynamically by Host + * Actual L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer are allocated dynamically by Host * and pushed into the per-core/thread free_queue. 
* - * Base size = sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState) - * With phases = Base + sizeof(AicpuPhaseHeader) + num_threads * sizeof(PhaseBufferState) + * Base size = sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool) + * With phases = Base + sizeof(L2SwimlaneAicpuPhaseHeader) + num_threads * sizeof(L2SwimlaneAicpuPhasePool) */ -#ifndef SRC_A5_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_ -#define SRC_A5_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_ +#ifndef SRC_A5_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_ +#define SRC_A5_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_ #include #include @@ -60,13 +60,13 @@ #include "common/core_type.h" #include "common/platform_config.h" -// Maximum number of successor tasks per L2PerfRecord (matches Task::fanout) +// Maximum number of successor tasks per L2SwimlaneAicpuTaskRecord (matches Task::fanout) #ifndef RUNTIME_MAX_FANOUT #define RUNTIME_MAX_FANOUT 128 #endif // ============================================================================= -// L2 perf_level — granularity ladder for the L2 swimlane profiler. +// L2 swimlane_level — granularity ladder for the L2 swimlane profiler. // // Each level is a strict superset of the previous: higher levels add the data // described by their name on top of all lower-level data. Naming describes @@ -74,12 +74,12 @@ // naturally — e.g. `if (level >= SCHED_PHASES)` means "this section runs when // scheduler phase records are being collected (or any higher tier)". // -// Transported via `L2PerfDataHeader::l2_perf_level` (host → AICPU, +// Transported via `L2SwimlaneDataHeader::l2_swimlane_level` (host → AICPU, // shared memory) and `CallConfig::enable_l2_swimlane` (Python → C). The wire // representation stays integer (uint32_t / int32_t) for ABI stability; this // enum is the canonical in-code type used for comparisons. 
// ============================================================================= -enum class L2PerfLevel : uint32_t { +enum class L2SwimlaneLevel : uint32_t { DISABLED = 0, // No collection at all AICORE_TIMING = 1, // AICore per-task start/end timestamps + task record buffer AICPU_TIMING = 2, // + AICPU dispatch/finish timestamps + fanout dependency list @@ -88,13 +88,13 @@ enum class L2PerfLevel : uint32_t { }; // ============================================================================= -// L2PerfRecord - Single Task Execution Record +// L2SwimlaneAicpuTaskRecord - Single Task Execution Record // ============================================================================= /** * Single task execution record */ -struct L2PerfRecord { +struct L2SwimlaneAicpuTaskRecord { // Timing information (device clock timestamps) uint64_t start_time; // Task start timestamp (get_sys_cnt) uint64_t end_time; // Task end timestamp @@ -117,30 +117,33 @@ struct L2PerfRecord { int32_t fanout_count; // Number of successor tasks } __attribute__((aligned(64))); -static_assert(sizeof(L2PerfRecord) % 64 == 0, "L2PerfRecord must be 64-byte aligned for optimal cache performance"); +static_assert( + sizeof(L2SwimlaneAicpuTaskRecord) % 64 == 0, + "L2SwimlaneAicpuTaskRecord must be 64-byte aligned for optimal cache performance" +); // ============================================================================= -// L2PerfAicoreRing - Stable AICore→AICPU Staging Ring (per core, never rotated) +// L2SwimlaneAicoreRing - Stable AICore→AICPU Staging Ring (per core, never rotated) // ============================================================================= /** * Per-core staging ring written exclusively by AICore. * * AICore stores each task's timing in `dual_issue_slots[reg_task_id % - * PLATFORM_L2_AICORE_RING_SIZE]` and never touches any other L2Perf + * PLATFORM_L2_AICORE_RING_SIZE]` and never touches any other L2Swimlane * memory. 
The ring is allocated once by the host, addressed through - * `L2PerfBufferState[block_idx].aicore_ring_ptr` (also published into the - * `KernelArgs::aicore_l2_perf_ring_addrs` the AICore kernel entry - * forwards into `set_aicore_l2_perf_ring()`), and lives for the entire run + * `L2SwimlaneAicpuTaskPool[block_idx].aicore_ring_ptr` (also published into the + * `KernelArgs::aicore_l2_swimlane_ring_addrs` the AICore kernel entry + * forwards into `set_aicore_l2_swimlane_ring()`), and lives for the entire run * — its address is never reassigned, decoupling AICore writes from the * AICPU's records-buffer rotation. */ -struct L2PerfAicoreRing { - L2PerfRecord dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE]; +struct L2SwimlaneAicoreRing { + L2SwimlaneAicpuTaskRecord dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE]; } __attribute__((aligned(64))); // ============================================================================= -// L2PerfBuffer - Fixed-Size Record Buffer (AICPU-only) +// L2SwimlaneAicpuTaskBuffer - Fixed-Size Record Buffer (AICPU-only) // ============================================================================= /** @@ -151,16 +154,16 @@ struct L2PerfAicoreRing { * by AICPU when full. * * Owned and written exclusively by AICPU: AICore never touches this memory. - * AICPU reads timing from L2PerfAicoreRing::dual_issue_slots, fills in the + * AICPU reads timing from L2SwimlaneAicoreRing::dual_issue_slots, fills in the * AICPU-side fields, then commits into records[count++]. 
*/ -struct L2PerfBuffer { - L2PerfRecord records[PLATFORM_PROF_BUFFER_SIZE]; // Committed records (AICPU writes) - volatile uint32_t count; // Current committed record count +struct L2SwimlaneAicpuTaskBuffer { + L2SwimlaneAicpuTaskRecord records[PLATFORM_PROF_BUFFER_SIZE]; // Committed records (AICPU writes) + volatile uint32_t count; // Current committed record count } __attribute__((aligned(64))); // ============================================================================= -// L2PerfFreeQueue - SPSC Lock-Free Queue for Free Buffers +// L2SwimlaneFreeQueue - SPSC Lock-Free Queue for Free Buffers // ============================================================================= /** @@ -178,17 +181,17 @@ struct L2PerfBuffer { * - Device pop: rmb() → read tail → read buffer_ptrs[head % COUNT] → rmb() → write head → wmb() * - Host push: write buffer_ptrs[tail % COUNT] → wmb() → write tail → wmb() */ -struct L2PerfFreeQueue { +struct L2SwimlaneFreeQueue { volatile uint64_t buffer_ptrs[PLATFORM_PROF_SLOT_COUNT]; // Free buffer addresses volatile uint32_t head; // Consumer read position (Device increments) volatile uint32_t tail; // Producer write position (Host increments) uint32_t pad[22]; // Pad to 128 bytes (aligned to cache line) } __attribute__((aligned(64))); -static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes for cache alignment"); +static_assert(sizeof(L2SwimlaneFreeQueue) == 128, "L2SwimlaneFreeQueue must be 128 bytes for cache alignment"); // ============================================================================= -// L2PerfBufferState - Per-Core/Thread Buffer State (Unified for L2PerfRecord and Phase) +// L2SwimlaneAicpuTaskPool - Per-Core/Thread Buffer State (Unified for L2SwimlaneAicpuTaskRecord and Phase) // ============================================================================= /** @@ -197,9 +200,9 @@ static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes * Contains: * - free_queue: SPSC 
queue of available buffer addresses * - current_buf_ptr: Currently active buffer being written (0 = no active buffer) - * - aicore_ring_ptr: Stable per-core L2PerfAicoreRing address (L2PerfRecord + * - aicore_ring_ptr: Stable per-core L2SwimlaneAicoreRing address (L2SwimlaneAicpuTaskRecord * profiling only; unused by Phase profiling). Set by host at init, read by - * AICPU in `l2_perf_aicpu_complete_record` to read the AICore-published + * AICPU in `l2_swimlane_aicpu_complete_task` to read the AICore-published * timing slots. Never reassigned during the run. * - current_buf_seq: Monotonic sequence number for ordering * - total_record_count / dropped_record_count / mismatch_record_count: @@ -209,9 +212,9 @@ static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes * violations (a hard error class, distinct from capacity drops). * * Used in two contexts: - * - Per-core L2PerfRecord profiling (current_buf_ptr → L2PerfBuffer, - * aicore_ring_ptr → L2PerfAicoreRing) - * - Per-thread Phase profiling (current_buf_ptr → PhaseBuffer, + * - Per-core L2SwimlaneAicpuTaskRecord profiling (current_buf_ptr → L2SwimlaneAicpuTaskBuffer, + * aicore_ring_ptr → L2SwimlaneAicoreRing) + * - Per-thread Phase profiling (current_buf_ptr → L2SwimlaneAicpuPhaseBuffer, * aicore_ring_ptr / mismatch_record_count unused) * * Writers: @@ -221,10 +224,10 @@ static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes * - current_buf_seq: Device writes (monotonic counter) * - aicore_ring_ptr: Host writes once at init, AICPU reads */ -struct L2PerfBufferState { - L2PerfFreeQueue free_queue; // SPSC queue of free buffer addresses +struct L2SwimlaneAicpuTaskPool { + L2SwimlaneFreeQueue free_queue; // SPSC queue of free buffer addresses volatile uint64_t current_buf_ptr; // Current active buffer (0 = none) - volatile uint64_t aicore_ring_ptr; // Stable AICore staging ring (L2Perf only; 0 for Phase) + volatile uint64_t aicore_ring_ptr; // Stable AICore staging 
ring (L2Swimlane only; 0 for Phase) volatile uint32_t current_buf_seq; // Sequence number for ordering volatile uint32_t total_record_count; // Records the AICPU attempted to write to this state volatile uint32_t dropped_record_count; // Records dropped (queue full / overwrite / no buffer) @@ -232,35 +235,43 @@ struct L2PerfBufferState { uint32_t pad[8]; // Pad to 192 bytes (aligned to cache line) } __attribute__((aligned(64))); -static_assert(sizeof(L2PerfBufferState) == 192, "L2PerfBufferState must be 192 bytes for cache alignment"); +static_assert(sizeof(L2SwimlaneAicpuTaskPool) == 192, "L2SwimlaneAicpuTaskPool must be 192 bytes for cache alignment"); // Type alias for semantic clarity in Phase profiling context -using PhaseBufferState = L2PerfBufferState; // Per-thread Phase profiling +using L2SwimlaneAicpuPhasePool = L2SwimlaneAicpuTaskPool; // Per-thread Phase profiling // ============================================================================= // ReadyQueueEntry - Queue Entry for Ready Buffers // ============================================================================= +/** + * Buffer kind for ReadyQueueEntry::kind. uint32_t underlying so the struct + * layout matches the prior `is_phase` field byte-for-byte. a5 currently uses + * only AicpuTask and AicpuPhase; AicoreTask is reserved for the AICore-as- + * producer migration to a5. + */ +enum class L2SwimlaneBufferKind : uint32_t { + AicpuTask = 0, // Per-core L2SwimlaneAicpuTaskBuffer, AICPU writes + AicpuPhase = 1, // Per-thread L2SwimlaneAicpuPhaseBuffer, AICPU writes + AicoreTask = 2, // Reserved (mirrors a2a3) +}; + /** * Ready queue entry * * When a buffer on a core/thread is full, AICPU adds this entry to the queue. * Host memory manager retrieves entries from the queue. 
- * - * Entry types (distinguished by is_phase flag): - * - L2PerfRecord entry: core_index = core ID, is_phase = 0 - * - Phase entry: core_index = thread_idx, is_phase = 1 */ struct ReadyQueueEntry { - uint32_t core_index; // Core index (0 ~ num_cores-1), or thread_idx for phase entries - uint32_t is_phase; // 0 = L2PerfRecord, 1 = Phase - uint64_t buffer_ptr; // Device pointer to the full buffer - uint32_t buffer_seq; // Sequence number for ordering - uint32_t pad; // Alignment padding + uint32_t core_index; // Core index (0 ~ num_cores-1), or thread_idx for phase entries + L2SwimlaneBufferKind kind; // Buffer kind discriminator (uint32_t underlying) + uint64_t buffer_ptr; // Device pointer to the full buffer + uint32_t buffer_seq; // Sequence number for ordering + uint32_t pad; // Alignment padding } __attribute__((aligned(32))); // ============================================================================= -// L2PerfDataHeader - Fixed Header +// L2SwimlaneDataHeader - Fixed Header // ============================================================================= /** @@ -279,7 +290,7 @@ struct ReadyQueueEntry { * - Queue empty: head == tail * - Queue full: (tail + 1) % capacity == head */ -struct L2PerfDataHeader { +struct L2SwimlaneDataHeader { // Per-thread ready queues (FIFO Circular Buffers) // Each AICPU thread has its own queue to avoid lock contention ReadyQueueEntry queues[PLATFORM_MAX_AICPU_THREADS][PLATFORM_PROF_READYQUEUE_SIZE]; @@ -287,10 +298,10 @@ struct L2PerfDataHeader { volatile uint32_t queue_tails[PLATFORM_MAX_AICPU_THREADS]; // Producer write positions (AICPU modifies) // Metadata (Host initializes, Device read-only) - uint32_t num_cores; // Actual number of cores launched - uint32_t l2_perf_level; // 0=off, 1=AICore timing, 2=+dispatch/fanout, - // 3=+sched phases, 4=+orch phases. Host writes - // at init; AICPU reads in l2_perf_aicpu_init. 
+ uint32_t num_cores; // Actual number of cores launched + uint32_t l2_swimlane_level; // 0=off, 1=AICore timing, 2=+dispatch/fanout, + // 3=+sched phases, 4=+orch phases. Host writes + // at init; AICPU reads in l2_swimlane_aicpu_init. } __attribute__((aligned(64))); // ============================================================================= @@ -303,7 +314,7 @@ struct L2PerfDataHeader { * Scheduler phases (0-3): four phases in each scheduler loop iteration. * Orchestrator phases (16-24): sub-steps within each submit_task() call. */ -enum class AicpuPhaseId : uint32_t { +enum class L2SwimlaneAicpuPhaseId : uint32_t { // Scheduler phases (0-3) SCHED_COMPLETE = 0, // Process completed tasks (fanout traversal) SCHED_DISPATCH = 1, // Dispatch ready tasks to idle cores @@ -333,11 +344,11 @@ enum class AicpuPhaseId : uint32_t { * extra2 = pop_miss delta since last emit * All other phases: extras are 0 (reserved for future per-phase metrics). */ -struct AicpuPhaseRecord { - uint64_t start_time; // Phase start timestamp - uint64_t end_time; // Phase end timestamp - uint32_t loop_iter; // Loop iteration number - AicpuPhaseId phase_id; // Phase type +struct L2SwimlaneAicpuPhaseRecord { + uint64_t start_time; // Phase start timestamp + uint64_t end_time; // Phase end timestamp + uint32_t loop_iter; // Loop iteration number + L2SwimlaneAicpuPhaseId phase_id; // Phase type union { uint64_t task_id; // tensormap_and_ringbuffer: full PTO2 encoding // (ring_id << 32) | local_id for cross-view correlation. @@ -346,31 +357,31 @@ struct AicpuPhaseRecord { uint32_t extra1; // Phase-specific delta (e.g. SCHED_DISPATCH = pop_hit) uint32_t extra2; // Phase-specific delta (e.g. 
SCHED_DISPATCH = pop_miss) }; -static_assert(sizeof(AicpuPhaseRecord) == 40, "AicpuPhaseRecord layout drift"); +static_assert(sizeof(L2SwimlaneAicpuPhaseRecord) == 40, "L2SwimlaneAicpuPhaseRecord layout drift"); -constexpr uint32_t AICPU_PHASE_MAGIC = 0x41435048; // "ACPH" +constexpr uint32_t L2_SWIMLANE_AICPU_PHASE_MAGIC = 0x41435048; // "ACPH" /** - * Fixed-size phase record buffer (analogous to L2PerfBuffer) + * Fixed-size phase record buffer (analogous to L2SwimlaneAicpuTaskBuffer) * * Capacity: PLATFORM_PHASE_RECORDS_PER_THREAD * Allocated dynamically by Host, pushed into per-thread free_queue. */ -struct PhaseBuffer { - AicpuPhaseRecord records[PLATFORM_PHASE_RECORDS_PER_THREAD]; +struct L2SwimlaneAicpuPhaseBuffer { + L2SwimlaneAicpuPhaseRecord records[PLATFORM_PHASE_RECORDS_PER_THREAD]; volatile uint32_t count; } __attribute__((aligned(64))); /** * AICPU phase profiling header * - * Located after the L2PerfBufferState array in shared memory. + * Located after the L2SwimlaneAicpuTaskPool array in shared memory. * Contains metadata and per-thread tracking. 
*/ -struct AicpuPhaseHeader { - uint32_t magic; // Validation magic (AICPU_PHASE_MAGIC) +struct L2SwimlaneAicpuPhaseHeader { + uint32_t magic; // Validation magic (L2_SWIMLANE_AICPU_PHASE_MAGIC) uint32_t num_sched_threads; // Number of scheduler threads - uint32_t records_per_thread; // Max records per PhaseBuffer + uint32_t records_per_thread; // Max records per L2SwimlaneAicpuPhaseBuffer uint32_t num_cores; // Total number of cores with valid assignments int8_t core_to_thread[PLATFORM_MAX_CORES]; // core_id → scheduler thread index (-1 = unassigned) } __attribute__((aligned(64))); @@ -387,41 +398,45 @@ extern "C" { * Calculate total memory size for performance data (buffer states only, no buffers) * * Formula: Total size = Fixed header + Dynamic tail - * = sizeof(L2PerfDataHeader) + num_cores × sizeof(L2PerfBufferState) + * = sizeof(L2SwimlaneDataHeader) + num_cores × sizeof(L2SwimlaneAicpuTaskPool) * * @param num_cores Number of cores (block_dim × PLATFORM_CORES_PER_BLOCKDIM) * @return Total bytes for header + buffer states */ inline size_t calc_perf_data_size(int num_cores) { - return sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState); + return sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool); } /** * Get header pointer * * @param base_ptr Shared memory base address (device_ptr or host_ptr) - * @return L2PerfDataHeader pointer + * @return L2SwimlaneDataHeader pointer */ -inline L2PerfDataHeader *get_l2_perf_header(void *base_ptr) { return reinterpret_cast(base_ptr); } +inline L2SwimlaneDataHeader *get_l2_swimlane_header(void *base_ptr) { + return reinterpret_cast(base_ptr); +} /** - * Get L2PerfBufferState array start address + * Get L2SwimlaneAicpuTaskPool array start address * * @param base_ptr Shared memory base address - * @return L2PerfBufferState array pointer + * @return L2SwimlaneAicpuTaskPool array pointer */ -inline L2PerfBufferState *get_perf_buffer_states(void *base_ptr) { - return 
reinterpret_cast(reinterpret_cast(base_ptr) + sizeof(L2PerfDataHeader)); +inline L2SwimlaneAicpuTaskPool *get_perf_buffer_states(void *base_ptr) { + return reinterpret_cast( + reinterpret_cast(base_ptr) + sizeof(L2SwimlaneDataHeader) + ); } /** - * Get L2PerfBufferState for specified core + * Get L2SwimlaneAicpuTaskPool for specified core * * @param base_ptr Shared memory base address * @param core_index Core index (0 ~ num_cores-1) - * @return L2PerfBufferState pointer + * @return L2SwimlaneAicpuTaskPool pointer */ -inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index) { +inline L2SwimlaneAicpuTaskPool *get_perf_buffer_state(void *base_ptr, int core_index) { return &get_perf_buffer_states(base_ptr)[core_index]; } @@ -433,42 +448,45 @@ inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index) * @return Total bytes needed for header + all buffer states */ inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) { - return calc_perf_data_size(num_cores) + sizeof(AicpuPhaseHeader) + num_sched_threads * sizeof(PhaseBufferState); + return calc_perf_data_size(num_cores) + sizeof(L2SwimlaneAicpuPhaseHeader) + + num_sched_threads * sizeof(L2SwimlaneAicpuPhasePool); } /** - * Get AicpuPhaseHeader pointer (located after L2PerfBufferState array) + * Get L2SwimlaneAicpuPhaseHeader pointer (located after L2SwimlaneAicpuTaskPool array) * * @param base_ptr Shared memory base address * @param num_cores Number of AICore instances - * @return AicpuPhaseHeader pointer + * @return L2SwimlaneAicpuPhaseHeader pointer */ -inline AicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) { - return reinterpret_cast(reinterpret_cast(base_ptr) + calc_perf_data_size(num_cores)); +inline L2SwimlaneAicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) { + return reinterpret_cast( + reinterpret_cast(base_ptr) + calc_perf_data_size(num_cores) + ); } /** - * Get PhaseBufferState array start address 
(located after AicpuPhaseHeader) + * Get L2SwimlaneAicpuPhasePool array start address (located after L2SwimlaneAicpuPhaseHeader) * * @param base_ptr Shared memory base address * @param num_cores Number of AICore instances - * @return PhaseBufferState array pointer + * @return L2SwimlaneAicpuPhasePool array pointer */ -inline PhaseBufferState *get_phase_buffer_states(void *base_ptr, int num_cores) { - return reinterpret_cast( - reinterpret_cast(get_phase_header(base_ptr, num_cores)) + sizeof(AicpuPhaseHeader) +inline L2SwimlaneAicpuPhasePool *get_phase_buffer_states(void *base_ptr, int num_cores) { + return reinterpret_cast( + reinterpret_cast(get_phase_header(base_ptr, num_cores)) + sizeof(L2SwimlaneAicpuPhaseHeader) ); } /** - * Get PhaseBufferState for specified thread + * Get L2SwimlaneAicpuPhasePool for specified thread * * @param base_ptr Shared memory base address * @param num_cores Number of AICore instances * @param thread_idx Thread index - * @return PhaseBufferState pointer + * @return L2SwimlaneAicpuPhasePool pointer */ -inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) { +inline L2SwimlaneAicpuPhasePool *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) { return &get_phase_buffer_states(base_ptr, num_cores)[thread_idx]; } @@ -476,4 +494,4 @@ inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, i } #endif -#endif // SRC_A5_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_ +#endif // SRC_A5_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_ diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h index 9d1c0cb57..75cddbf18 100644 --- a/src/a5/platform/include/common/platform_config.h +++ b/src/a5/platform/include/common/platform_config.h @@ -104,7 +104,7 @@ constexpr int PLATFORM_MAX_CORES = PLATFORM_MAX_BLOCKDIM * PLATFORM_CORES_PER_BL /** * Performance buffer capacity per buffer - * Number of L2PerfRecord entries 
per dynamically allocated L2PerfBuffer + * Number of L2SwimlaneAicpuTaskRecord entries per dynamically allocated L2SwimlaneAicpuTaskBuffer */ constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000; @@ -118,13 +118,13 @@ constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000; constexpr int PLATFORM_PROF_SLOT_COUNT = 4; /** - * L2PerfBuffer pre-allocation count per AICore. + * L2SwimlaneAicpuTaskBuffer pre-allocation count per AICore. * 1 goes into the free_queue at init, the rest into the recycled pool. */ constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8; /** - * PhaseBuffer pre-allocation count per AICPU thread. + * L2SwimlaneAicpuPhaseBuffer pre-allocation count per AICPU thread. * 1 goes into the free_queue at init, the rest into the recycled pool. */ constexpr int PLATFORM_PROF_BUFFERS_PER_THREAD = 16; @@ -139,7 +139,7 @@ constexpr int PLATFORM_PROF_READYQUEUE_SIZE = /** * Performance buffer capacity per AICPU thread - * Maximum number of AicpuPhaseRecord entries per PhaseBuffer. + * Maximum number of L2SwimlaneAicpuPhaseRecord entries per L2SwimlaneAicpuPhaseBuffer. */ constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 500000; @@ -229,14 +229,14 @@ constexpr int PLATFORM_DUMP_TIMEOUT_SECONDS = 30; constexpr int PLATFORM_PMU_RECORDS_PER_BUFFER = 512; /** - * Per-core L2Perf staging ring depth (AICore-side WIP slots). + * Per-core L2Swimlane staging ring depth (AICore-side WIP slots). * * Must be ≥ the maximum number of in-flight tasks per core (today's * dual-issue dispatch keeps this at 2). The ring lives outside the - * rotating L2PerfBuffer so AICore's write address never changes mid-run. + * rotating L2SwimlaneAicpuTaskBuffer so AICore's write address never changes mid-run. * * Indexing uses `task_id % PLATFORM_L2_AICORE_RING_SIZE` (see - * `l2_perf_aicore_record_task`), so non-power-of-two values are correct + * `l2_swimlane_aicore_record_task`), so non-power-of-two values are correct * but compile to an integer divide on the AICore hot path. 
Prefer a power * of two so the compiler reduces the modulo to a mask. */ diff --git a/src/a5/platform/include/common/pmu_profiling.h b/src/a5/platform/include/common/pmu_profiling.h index ad2fb39d0..680c81c83 100644 --- a/src/a5/platform/include/common/pmu_profiling.h +++ b/src/a5/platform/include/common/pmu_profiling.h @@ -25,7 +25,7 @@ * * a5 has no halHostRegister (DAV_3510), so host↔device SPSC fields are * read/written via rtMemcpy (onboard) or memcpy (sim), using host shadow - * buffers — same pattern as a5 l2_perf_collector and tensor_dump_collector. + * buffers — same pattern as a5 l2_swimlane_collector and tensor_dump_collector. */ #ifndef SRC_A5_PLATFORM_INCLUDE_COMMON_PMU_PROFILING_H_ diff --git a/src/a5/platform/include/common/scope_stats.h b/src/a5/platform/include/common/scope_stats.h index 88efa72dd..844e34089 100644 --- a/src/a5/platform/include/common/scope_stats.h +++ b/src/a5/platform/include/common/scope_stats.h @@ -17,7 +17,7 @@ * scope_end — each carrying the task/heap ring start/end and the tensormap * live-entry count sampled at that boundary, tagged with a phase flag. 
Records * stream off the device in - * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_perf (the + * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_swimlane (the * single source of mgmt-loop truth is * src/a2a3/platform/include/host/profiling_common/profiler_base.h): * diff --git a/src/a5/platform/include/host/l2_perf_collector.h b/src/a5/platform/include/host/l2_swimlane_collector.h similarity index 66% rename from src/a5/platform/include/host/l2_perf_collector.h rename to src/a5/platform/include/host/l2_swimlane_collector.h index 2218f0952..dd5e33cc1 100644 --- a/src/a5/platform/include/host/l2_perf_collector.h +++ b/src/a5/platform/include/host/l2_swimlane_collector.h @@ -10,24 +10,24 @@ */ /** - * @file l2_perf_collector.h + * @file l2_swimlane_collector.h * @brief Platform-agnostic performance data collector with dynamic memory management. * * Architecture: - * - BufferPoolManager: shared mgmt-thread infrastructure that + * - BufferPoolManager: shared mgmt-thread infrastructure that * polls the AICPU ready queue, replenishes per-core / per-thread free * queues, and hands full buffers off to the collector thread. - * - L2PerfCollector: copies records from the manager's ready queue into + * - L2SwimlaneCollector: copies records from the manager's ready queue into * host vectors and exports the swimlane visualization. * * a5 specifics: device↔host transfers go through profiling_copy.h. The * framework's mgmt loop mirrors the shm region per tick; per-buffer - * payloads (L2PerfBuffer / PhaseBuffer) are pulled on demand inside + * payloads (L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer) are pulled on demand inside * ProfilerAlgorithms. 
*/ -#ifndef SRC_A5_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_ -#define SRC_A5_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_ +#ifndef SRC_A5_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_ +#define SRC_A5_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_ #include #include @@ -37,29 +37,29 @@ #include #include -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "host/profiling_common/profiler_base.h" // --------------------------------------------------------------------------- -// L2 Perf profiling Module (drives BufferPoolManager) +// L2 Perf profiling Module (drives BufferPoolManager) // --------------------------------------------------------------------------- /** * L2 Perf has two distinct buffer kinds going through one ready queue per * AICPU thread: - * - kind 0: per-core L2PerfBuffer (task records) - * - kind 1: per-thread PhaseBuffer (scheduler/orchestrator phase records) - * The ReadyQueueEntry::is_phase flag picks between them. + * - kind 0: per-core L2SwimlaneAicpuTaskBuffer (task records) + * - kind 1: per-thread L2SwimlaneAicpuPhaseBuffer (scheduler/orchestrator phase records) + * The ReadyQueueEntry::kind flag picks between them. */ /** * Buffer kind discriminator carried in ReadyBufferInfo and used to index * the per-kind recycled pool inside BufferPoolManager. 
*/ -enum class ProfBufferType { PERF_RECORD = 0, PHASE = 1 }; +enum class ProfBufferType { AICPU_TASK = 0, AICPU_PHASE = 1 }; /** * Information about a ready (full) buffer, passed from mgmt thread to @@ -74,16 +74,16 @@ struct ReadyBufferInfo { uint32_t buffer_seq; // Sequence number for ordering }; -struct L2PerfModule { - using DataHeader = L2PerfDataHeader; +struct L2SwimlaneModule { + using DataHeader = L2SwimlaneDataHeader; using ReadyEntry = ReadyQueueEntry; using ReadyBufferInfo = ::ReadyBufferInfo; - using FreeQueue = L2PerfFreeQueue; // PhaseBufferState aliases L2PerfBufferState + using FreeQueue = L2SwimlaneFreeQueue; // L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool static constexpr int kBufferKinds = 2; // 0=PERF_RECORD, 1=PHASE static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT; - static constexpr const char *kSubsystemName = "L2PerfModule"; + static constexpr const char *kSubsystemName = "L2SwimlaneModule"; /** * batch_size for proactive_replenish's alloc fallback. Sized so that a @@ -99,39 +99,39 @@ struct L2PerfModule { static int kind_of(const ReadyBufferInfo &info) { return static_cast(info.type); } - static DataHeader *header_from_shm(void *shm) { return get_l2_perf_header(shm); } + static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); } /** - * Branch on `is_phase` to pick the per-core perf state vs. the + * Branch on `entry.kind` to pick the per-core perf state vs. the * per-thread phase state. Returns nullopt for out-of-range indices * (which would otherwise corrupt unrelated BufferStates downstream). 
*/ - static std::optional> + static std::optional> resolve_entry(void *shm, DataHeader *header, int /*q*/, const ReadyEntry &entry) { - const bool is_phase = (entry.is_phase != 0); + const bool is_phase = (entry.kind == L2SwimlaneBufferKind::AicpuPhase); const int num_cores = static_cast(header->num_cores); if (is_phase) { if (entry.core_index >= static_cast(PLATFORM_MAX_AICPU_THREADS)) { - LOG_ERROR("L2PerfModule: invalid phase entry: thread=%u", entry.core_index); + LOG_ERROR("L2SwimlaneModule: invalid phase entry: thread=%u", entry.core_index); return std::nullopt; } } else { if (entry.core_index >= static_cast(num_cores)) { - LOG_ERROR("L2PerfModule: invalid perf entry: core=%u", entry.core_index); + LOG_ERROR("L2SwimlaneModule: invalid perf entry: core=%u", entry.core_index); return std::nullopt; } } - L2PerfBufferState *state = is_phase ? - get_phase_buffer_state(shm, num_cores, static_cast(entry.core_index)) : - get_perf_buffer_state(shm, static_cast(entry.core_index)); + L2SwimlaneAicpuTaskPool *state = + is_phase ? get_phase_buffer_state(shm, num_cores, static_cast(entry.core_index)) : + get_perf_buffer_state(shm, static_cast(entry.core_index)); - profiling_common::EntrySite site; - site.kind = is_phase ? 1 : 0; + profiling_common::EntrySite site; + site.kind = static_cast(entry.kind); site.free_queue = &state->free_queue; - site.buffer_size = is_phase ? sizeof(PhaseBuffer) : sizeof(L2PerfBuffer); - site.info.type = is_phase ? ProfBufferType::PHASE : ProfBufferType::PERF_RECORD; + site.buffer_size = is_phase ? sizeof(L2SwimlaneAicpuPhaseBuffer) : sizeof(L2SwimlaneAicpuTaskBuffer); + site.info.type = is_phase ? 
ProfBufferType::AICPU_PHASE : ProfBufferType::AICPU_TASK; site.info.index = entry.core_index; site.info.slot_idx = 0; site.info.dev_buffer_ptr = reinterpret_cast(entry.buffer_ptr); @@ -146,17 +146,18 @@ struct L2PerfModule { // Per-core perf states (kind 0) for (int i = 0; i < num_cores; i++) { - L2PerfBufferState *state = get_perf_buffer_state(shm, i); - cb(/*kind=*/0, &state->free_queue, sizeof(L2PerfBuffer)); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm, i); + cb(/*kind=*/0, &state->free_queue, sizeof(L2SwimlaneAicpuTaskBuffer)); } - // Per-thread phase states (kind 1) — gated on AicpuPhaseHeader being + // Per-thread phase states (kind 1) — gated on L2SwimlaneAicpuPhaseHeader being // initialized (runtimes that don't emit phase records leave it zero). - AicpuPhaseHeader *ph = get_phase_header(shm, num_cores); - const int num_phase_threads = (ph->magic == AICPU_PHASE_MAGIC) ? static_cast(ph->num_sched_threads) : 0; + L2SwimlaneAicpuPhaseHeader *ph = get_phase_header(shm, num_cores); + const int num_phase_threads = + (ph->magic == L2_SWIMLANE_AICPU_PHASE_MAGIC) ? static_cast(ph->num_sched_threads) : 0; for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(shm, num_cores, t); - cb(/*kind=*/1, &state->free_queue, sizeof(PhaseBuffer)); + L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm, num_cores, t); + cb(/*kind=*/1, &state->free_queue, sizeof(L2SwimlaneAicpuPhaseBuffer)); } } }; @@ -167,13 +168,13 @@ struct L2PerfModule { // because they wrap stateless HAL globals. On a5 onboard the runner passes // register_cb=nullptr and the framework installs a malloc-shadow + DMA // fallback (default_host_shadow_register). 
-using L2PerfAllocCallback = profiling_common::ProfAllocCallback; -using L2PerfRegisterCallback = profiling_common::ProfRegisterCallback; -using L2PerfUnregisterCallback = profiling_common::ProfUnregisterCallback; -using L2PerfFreeCallback = profiling_common::ProfFreeCallback; +using L2SwimlaneAllocCallback = profiling_common::ProfAllocCallback; +using L2SwimlaneRegisterCallback = profiling_common::ProfRegisterCallback; +using L2SwimlaneUnregisterCallback = profiling_common::ProfUnregisterCallback; +using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback; // ============================================================================= -// L2PerfCollector +// L2SwimlaneCollector // ============================================================================= /** @@ -192,7 +193,7 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback; * entries have a consumer). * 5. read_phase_header_metadata() — single-shot read of the * core→thread mapping from the - * AicpuPhaseHeader. + * L2SwimlaneAicpuPhaseHeader. * 6. reconcile_counters() — leftover-active sanity check (a5 lacks * total/dropped/mismatch counters until * the staging-ring redesign lands). @@ -202,31 +203,31 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback; * device flush is the only data path. Any non-zero `current_buf_ptr` after * stop() with non-empty count is logged as a bug. 
*/ -class L2PerfCollector : public profiling_common::ProfilerBase { +class L2SwimlaneCollector : public profiling_common::ProfilerBase { public: - L2PerfCollector() = default; - ~L2PerfCollector(); + L2SwimlaneCollector() = default; + ~L2SwimlaneCollector(); - L2PerfCollector(const L2PerfCollector &) = delete; - L2PerfCollector &operator=(const L2PerfCollector &) = delete; + L2SwimlaneCollector(const L2SwimlaneCollector &) = delete; + L2SwimlaneCollector &operator=(const L2SwimlaneCollector &) = delete; // ProfilerBase contract static constexpr int kIdleTimeoutSec = PLATFORM_PROF_TIMEOUT_SECONDS; - static constexpr const char *kSubsystemName = "L2Perf"; + static constexpr const char *kSubsystemName = "L2Swimlane"; /** * Initialize performance profiling. * * Allocates the shared-memory region (header + per-core / per-thread - * BufferStates), pre-allocates initial L2PerfBuffers and PhaseBuffers, + * BufferStates), pre-allocates initial L2SwimlaneAicpuTaskBuffers and L2SwimlaneAicpuPhaseBuffers, * and seeds the per-pool free_queues + the framework's recycled pools. * * @param num_aicore Number of AICore instances * @param device_id Device ID (forwarded to register_cb) - * @param l2_perf_level Collection granularity (DISABLED / AICORE_TIMING + * @param l2_swimlane_level Collection granularity (DISABLED / AICORE_TIMING * / AICPU_TIMING / SCHED_PHASES / ORCH_PHASES). - * Written into `L2PerfDataHeader::l2_perf_level` - * so AICPU can promote it in `l2_perf_aicpu_init`, + * Written into `L2SwimlaneDataHeader::l2_swimlane_level` + * so AICPU can promote it in `l2_swimlane_aicpu_init`, * AND cached on the collector so * `export_swimlane_json()` can gate phase sections * and stamp the JSON `version`. @@ -235,27 +236,27 @@ class L2PerfCollector : public profiling_common::ProfilerBase/l2_perf_records.json — directory captured at + * Writes /l2_swimlane_records.json — directory captured at * initialize() time. 
* * @return 0 on success, error code on failure @@ -271,7 +272,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase> &get_records() const { return collected_perf_records_; } + const std::vector> &get_records() const { return collected_perf_records_; } private: // Shared memory pointers. shm_host_ / device_id_ live on ProfilerBase @@ -326,25 +327,25 @@ class L2PerfCollector : public profiling_common::ProfilerBase aicore_rings_dev_; void *aicore_ring_addrs_dev_{nullptr}; void *aicore_ring_addrs_host_{nullptr}; int num_aicore_{0}; - L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; + L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; // Per-task output directory captured at initialize() time. Consumed by - // export_swimlane_json() to build /l2_perf_records.json. + // export_swimlane_json() to build /l2_swimlane_records.json. std::string output_prefix_; // Collected data (per-core vectors, indexed by core_index) - std::vector> collected_perf_records_; + std::vector> collected_perf_records_; // AICPU phase profiling data (per-thread, mixed sched + orch records) - std::vector> collected_phase_records_; + std::vector> collected_phase_records_; bool has_phase_data_{false}; // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned) @@ -356,7 +357,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase dev_to_host_; - // Per-kind recycled buffer pools (vector indexed by Module's BufferKind id) + // Per-kind recycled buffer pools (vector indexed by Module-defined kind id) std::vector> recycled_; }; diff --git a/src/a5/platform/include/host/profiling_common/profiler_base.h b/src/a5/platform/include/host/profiling_common/profiler_base.h index 94ebcad87..496fa9ebb 100644 --- a/src/a5/platform/include/host/profiling_common/profiler_base.h +++ b/src/a5/platform/include/host/profiling_common/profiler_base.h @@ -11,7 +11,7 @@ /** * @file profiler_base.h - * @brief CRTP scaffolding shared by L2Perf / Dump / PMU collectors. 
+ * @brief CRTP scaffolding shared by L2Swimlane / Dump / PMU collectors. * * Owns the BufferPoolManager, the mgmt thread (which polls AICPU * ready queues and recycles buffers), and the collector poll thread. @@ -19,12 +19,12 @@ * Module concept contract * ----------------------- * - * Each profiling subsystem provides a `Module` struct (e.g., L2PerfModule, + * Each profiling subsystem provides a `Module` struct (e.g., L2SwimlaneModule, * DumpModule, PmuModule) that supplies the data-layout traits the unified * mgmt-loop algorithms (ProfilerAlgorithms) need. Required members: * * // Types - * using DataHeader = ...; // Shared-memory header (e.g. L2PerfDataHeader). + * using DataHeader = ...; // Shared-memory header (e.g. L2SwimlaneDataHeader). * using ReadyEntry = ...; // Per-AICPU-thread ready-queue entry. * using ReadyBufferInfo = ...; // Hand-off struct to the collector thread * // (carries dev/host ptrs, optional kind @@ -34,10 +34,10 @@ * // `buffer_ptrs[kSlotCount]`. * * // Constants - * static constexpr int kBufferKinds; // L2Perf=2 (perf+phase), Dump=1, PMU=1. + * static constexpr int kBufferKinds; // L2Swimlane=2 (perf+phase), Dump=1, PMU=1. * static constexpr uint32_t kReadyQueueSize; // Per-thread ready-queue depth. * static constexpr uint32_t kSlotCount; // FreeQueue::buffer_ptrs[] length. - * static constexpr const char* kSubsystemName; // "PMU" / "L2Perf" / "Dump". + * static constexpr const char* kSubsystemName; // "PMU" / "L2Swimlane" / "Dump". * * // Header pointer cast (host_ptr → DataHeader*) * static DataHeader* header_from_shm(void* shared_mem_host); @@ -115,7 +115,7 @@ * `write_range_to_device` writes. 
The bulk `mirror_shm_to_device` is * intentionally NOT called from mgmt_loop: it raced with AICPU writes * to device-only fields (current_buf_ptr, total/dropped/mismatch - * counters, queue_tails, free_queue.head, AicpuPhaseHeader::magic) and + * counters, queue_tails, free_queue.head, L2SwimlaneAicpuPhaseHeader::magic) and * rolled them back to the host-shadow values mirrored in at the top of * the tick. Buffer contents are mirrored on demand inside * ProfilerAlgorithms. @@ -138,7 +138,7 @@ * (use the subsystem's PLATFORM_*_TIMEOUT_SECONDS). * * static constexpr const char* kSubsystemName; - * Used in the idle-timeout log line (e.g. "L2Perf", "PMU", "TensorDump"). + * Used in the idle-timeout log line (e.g. "L2Swimlane", "PMU", "TensorDump"). */ #ifndef SRC_A5_PLATFORM_INCLUDE_HOST_PROFILING_COMMON_PROFILER_BASE_H_ @@ -162,7 +162,7 @@ namespace profiling_common { // Common subsystem callback signatures. All four collectors (PMU / TensorDump -// / L2Perf / DepGen) used to declare their own typedefs with identical +// / L2Swimlane / DepGen) used to declare their own typedefs with identical // shapes; these are the canonical types stashed in ProfilerBase via // set_memory_context(). // @@ -590,7 +590,7 @@ class ProfilerBase { * * The bulk `mirror_shm_to_device` deliberately is NOT called: it races * with AICPU writes to device-only fields (current_buf_ptr, total/dropped/ - * mismatch counters, queue_tails, free_queue.head, AicpuPhaseHeader::magic, + * mismatch counters, queue_tails, free_queue.head, L2SwimlaneAicpuPhaseHeader::magic, * core_to_thread[]) and rolls them back to whatever was mirrored in at * the start of the tick. Each host-side modification is written back as * a narrow field write inside Alg. 
diff --git a/src/a5/platform/onboard/aicore/kernel.cpp b/src/a5/platform/onboard/aicore/kernel.cpp index 6789b66b0..d53305f4e 100644 --- a/src/a5/platform/onboard/aicore/kernel.cpp +++ b/src/a5/platform/onboard/aicore/kernel.cpp @@ -15,7 +15,7 @@ #include "aicore/aicore_profiling_state.h" #include "common/core_type.h" #include "common/kernel_args.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/pmu_profiling.h" @@ -47,17 +47,19 @@ class Runtime; // linker dedup the otherwise-duplicate symbol definitions across the two // compilation units. [[block_local]] static uint32_t s_aicore_profiling_flag; -[[block_local]] static __gm__ L2PerfAicoreRing *s_aicore_l2_perf_ring; +[[block_local]] static __gm__ L2SwimlaneAicoreRing *s_aicore_l2_swimlane_ring; [[block_local]] static __gm__ PmuAicoreRing *s_aicore_pmu_ring; [[block_local]] static uint64_t s_aicore_pmu_reg_base; __attribute__((weak)) __aicore__ void set_aicore_profiling_flag(uint32_t flag) { s_aicore_profiling_flag = flag; } __attribute__((weak)) __aicore__ uint32_t get_aicore_profiling_flag() { return s_aicore_profiling_flag; } -__attribute__((weak)) __aicore__ void set_aicore_l2_perf_ring(__gm__ L2PerfAicoreRing *ring) { - s_aicore_l2_perf_ring = ring; +__attribute__((weak)) __aicore__ void set_aicore_l2_swimlane_ring(__gm__ L2SwimlaneAicoreRing *ring) { + s_aicore_l2_swimlane_ring = ring; +} +__attribute__((weak)) __aicore__ __gm__ L2SwimlaneAicoreRing *get_aicore_l2_swimlane_ring() { + return s_aicore_l2_swimlane_ring; } -__attribute__((weak)) __aicore__ __gm__ L2PerfAicoreRing *get_aicore_l2_perf_ring() { return s_aicore_l2_perf_ring; } __attribute__((weak)) __aicore__ void set_aicore_pmu_ring(__gm__ PmuAicoreRing *ring) { s_aicore_pmu_ring = ring; } __attribute__((weak)) __aicore__ __gm__ PmuAicoreRing *get_aicore_pmu_ring() { return s_aicore_pmu_ring; } @@ -80,7 +82,7 @@ extern __aicore__ void aicore_execute(__gm__ 
Runtime *runtime, int block_idx, Co * * Each core (AIC or AIV) gets its own handshake buffer indexed by block_idx. * Profiling state flows from KernelArgs into platform-owned per-core slots - * via set_aicore_profiling_flag() / set_aicore_l2_perf_ring() / + * via set_aicore_profiling_flag() / set_aicore_l2_swimlane_ring() / * set_aicore_pmu_ring() / set_aicore_pmu_reg_base(); the runtime's * Handshake stays profiling-free and aicore_execute keeps its original * signature. @@ -105,14 +107,14 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA // does not depend on any AICPU init ordering. set_aicore_profiling_flag(k_args->enable_profiling_flag); if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) { - __gm__ uint64_t *ring_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_l2_perf_ring_addrs); + __gm__ uint64_t *ring_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_l2_swimlane_ring_addrs); if (ring_table != nullptr) { - set_aicore_l2_perf_ring(reinterpret_cast<__gm__ L2PerfAicoreRing *>(ring_table[block_idx])); + set_aicore_l2_swimlane_ring(reinterpret_cast<__gm__ L2SwimlaneAicoreRing *>(ring_table[block_idx])); } else { - set_aicore_l2_perf_ring(nullptr); + set_aicore_l2_swimlane_ring(nullptr); } } else { - set_aicore_l2_perf_ring(nullptr); + set_aicore_l2_swimlane_ring(nullptr); } if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU)) { __gm__ uint64_t *pmu_ring_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_pmu_ring_addrs); diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 0f18c9909..1761b8a64 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -15,7 +15,7 @@ #include "common/platform_config.h" #include "aicpu/device_log.h" #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" 
#include "aicpu/platform_regs.h" #include "aicpu/platform_aicpu_affinity.h" #include "aicpu/pmu_collector_aicpu.h" @@ -105,7 +105,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a set_orch_device_id(static_cast(k_args->device_id)); set_platform_dump_base(k_args->dump_data_base); set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR)); - set_platform_l2_perf_base(k_args->l2_perf_data_base); + set_platform_l2_swimlane_base(k_args->l2_swimlane_data_base); set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)); set_platform_pmu_base(k_args->pmu_data_base); set_pmu_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU)); diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index d535ced0a..d613f0b46 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -43,7 +43,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/scope_stats_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 122d0181a..80199a3f9 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -108,9 +108,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // Initialize per-subsystem shared memory. 
if (enable_l2_swimlane_) { - rc = init_l2_perf(num_aicore, device_id_); + rc = init_l2_swimlane(num_aicore, device_id_); if (rc != 0) { - LOG_ERROR("init_l2_perf failed: %d", rc); + LOG_ERROR("init_l2_swimlane failed: %d", rc); return rc; } } @@ -219,8 +219,8 @@ int DeviceRunner::finalize() { // shadows). All four shared collectors use the same alloc/free shape // on a5: no unregister callback (a5 doesn't use halHostRegister) + // prof_free_cb (rtFree directly). - if (l2_perf_collector_.is_initialized()) { - l2_perf_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); + if (l2_swimlane_collector_.is_initialized()) { + l2_swimlane_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); } if (dump_collector_.is_initialized()) { dump_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); @@ -252,8 +252,8 @@ int DeviceRunner::finalize() { // `launch_aicpu_kernel` and `launch_aicore_kernel` live on `DeviceRunnerBase`. void DeviceRunner::finalize_collectors() { - if (l2_perf_collector_.is_initialized()) { - l2_perf_collector_.stop(); + if (l2_swimlane_collector_.is_initialized()) { + l2_swimlane_collector_.stop(); } if (dump_collector_.is_initialized()) { dump_collector_.stop(); @@ -263,15 +263,15 @@ void DeviceRunner::finalize_collectors() { } } -int DeviceRunner::init_l2_perf(int num_aicore, int device_id) { - int rc = l2_perf_collector_.initialize( - num_aicore, device_id, l2_perf_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_ +int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) { + int rc = l2_swimlane_collector_.initialize( + num_aicore, device_id, l2_swimlane_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_ ); if (rc == 0) { - kernel_args_.args.l2_perf_data_base = - reinterpret_cast(l2_perf_collector_.get_l2_perf_setup_device_ptr()); - kernel_args_.args.aicore_l2_perf_ring_addrs = - reinterpret_cast(l2_perf_collector_.get_aicore_ring_addrs_device_ptr()); + 
kernel_args_.args.l2_swimlane_data_base = + reinterpret_cast(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr()); + kernel_args_.args.aicore_l2_swimlane_ring_addrs = + reinterpret_cast(l2_swimlane_collector_.get_aicore_ring_addrs_device_ptr()); } return rc; } diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index aaf7a6b30..332169b00 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -43,12 +43,12 @@ #include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper #include "common/kernel_args.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "host/function_cache.h" #include "host/memory_allocator.h" -#include "host/l2_perf_collector.h" +#include "host/l2_swimlane_collector.h" #include "host/pmu_collector.h" #include "host/scope_stats_collector.h" #include "host/tensor_dump_collector.h" @@ -141,7 +141,7 @@ class DeviceRunner : public DeviceRunnerBase { // (`ChipCallableBuffer`, `CallableState`, `OrchSoBuffer`) are // inherited from `DeviceRunnerBase`. - // Shared collectors (`l2_perf_collector_`, `dump_collector_`, + // Shared collectors (`l2_swimlane_collector_`, `dump_collector_`, // `pmu_collector_`, `scope_stats_collector_`) live on `DeviceRunnerBase`. // `query_max_block_dim`, `validate_block_dim`, `ensure_binaries_loaded`, @@ -151,16 +151,16 @@ class DeviceRunner : public DeviceRunnerBase { /** * Initialize performance profiling device buffers * - * Allocates L2PerfSetupHeader and per-core/per-thread buffers on device; - * caller publishes the device pointer via kernel_args.l2_perf_data_base - * (AICPU reads it through get_platform_l2_perf_base()). 
+ * Allocates L2SwimlaneSetupHeader and per-core/per-thread buffers on device; + * caller publishes the device pointer via kernel_args.l2_swimlane_data_base + * (AICPU reads it through get_platform_l2_swimlane_base()). * * @param runtime Runtime instance to configure * @param num_aicore Number of AICore instances * @param device_id Device ID * @return 0 on success, error code on failure */ - int init_l2_perf(int num_aicore, int device_id); + int init_l2_swimlane(int num_aicore, int device_id); /** * Initialize tensor dump device buffers. @@ -180,7 +180,7 @@ class DeviceRunner : public DeviceRunnerBase { * Signature matches a2a3 for cross-platform consistency. */ // Shared enable flags (`enable_l2_swimlane_`, `enable_dump_tensor_`, - // `enable_pmu_`, `enable_scope_stats_`, `l2_perf_level_`, + // `enable_pmu_`, `enable_scope_stats_`, `l2_swimlane_level_`, // `pmu_event_type_`, `output_prefix_`) live on `DeviceRunnerBase`. int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id); diff --git a/src/a5/platform/sim/aicore/inner_kernel.h b/src/a5/platform/sim/aicore/inner_kernel.h index 42151f020..46c05f18c 100644 --- a/src/a5/platform/sim/aicore/inner_kernel.h +++ b/src/a5/platform/sim/aicore/inner_kernel.h @@ -38,12 +38,12 @@ // - with CACHELINE_OUT: write-back/flush (write to memory) -> release semantics // On aarch64, acquire-only fences do NOT prevent store-store reordering across the // barrier, so using acquire for the flush direction causes a race: the AICPU can -// observe the COND register FIN signal before l2_perf_buf->count is visible. +// observe the COND register FIN signal before l2_swimlane_buf->count is visible. // Using seq_cst (dmb ish / full barrier) covers both directions safely. // Use variadic macro to support both 2-arg and 3-arg calls. #define dcci(...) 
std::atomic_thread_fence(std::memory_order_seq_cst) -// dsb / mem_dsb_t — CANN provides these on real AICore; l2_perf_collector uses them after dcci flush. +// dsb / mem_dsb_t — CANN provides these on real AICore; l2_swimlane_collector uses them after dcci flush. // Simulation: full fence (same strength as dcci above) so AICPU ordering matches hardware intent. typedef int mem_dsb_t; #define dsb(_kind) \ diff --git a/src/a5/platform/sim/aicore/kernel.cpp b/src/a5/platform/sim/aicore/kernel.cpp index a81ff2d9b..414a2e125 100644 --- a/src/a5/platform/sim/aicore/kernel.cpp +++ b/src/a5/platform/sim/aicore/kernel.cpp @@ -23,7 +23,7 @@ #include "aicore/aicore.h" #include "aicore/aicore_profiling_state.h" #include "common/core_type.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/pmu_profiling.h" #include "runtime.h" @@ -35,7 +35,7 @@ static pthread_key_t g_reg_base_key; static pthread_key_t g_core_id_key; static pthread_key_t g_block_idx_key; static pthread_key_t g_aicore_profiling_flag_key; -static pthread_key_t g_aicore_l2_perf_ring_key; +static pthread_key_t g_aicore_l2_swimlane_ring_key; static pthread_key_t g_aicore_pmu_ring_key; static pthread_key_t g_pmu_reg_base_key; static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT; @@ -45,7 +45,7 @@ static void create_tls_keys() { pthread_key_create(&g_core_id_key, nullptr); pthread_key_create(&g_block_idx_key, nullptr); pthread_key_create(&g_aicore_profiling_flag_key, nullptr); - pthread_key_create(&g_aicore_l2_perf_ring_key, nullptr); + pthread_key_create(&g_aicore_l2_swimlane_ring_key, nullptr); pthread_key_create(&g_aicore_pmu_ring_key, nullptr); pthread_key_create(&g_pmu_reg_base_key, nullptr); } @@ -68,11 +68,11 @@ __aicore__ uint32_t get_aicore_profiling_flag() { return static_cast(reinterpret_cast(pthread_getspecific(g_aicore_profiling_flag_key))); } -__aicore__ void set_aicore_l2_perf_ring(__gm__ L2PerfAicoreRing *ring) { - 
pthread_setspecific(g_aicore_l2_perf_ring_key, reinterpret_cast(ring)); +__aicore__ void set_aicore_l2_swimlane_ring(__gm__ L2SwimlaneAicoreRing *ring) { + pthread_setspecific(g_aicore_l2_swimlane_ring_key, reinterpret_cast(ring)); } -__aicore__ __gm__ L2PerfAicoreRing *get_aicore_l2_perf_ring() { - return reinterpret_cast<__gm__ L2PerfAicoreRing *>(pthread_getspecific(g_aicore_l2_perf_ring_key)); +__aicore__ __gm__ L2SwimlaneAicoreRing *get_aicore_l2_swimlane_ring() { + return reinterpret_cast<__gm__ L2SwimlaneAicoreRing *>(pthread_getspecific(g_aicore_l2_swimlane_ring_key)); } __aicore__ void set_aicore_pmu_ring(__gm__ PmuAicoreRing *ring) { @@ -111,7 +111,7 @@ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type); // executor with its original signature. extern "C" void aicore_execute_wrapper( __gm__ Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs, - uint32_t enable_profiling_flag, uint64_t aicore_l2_perf_ring_addrs, uint64_t aicore_pmu_ring_addrs + uint32_t enable_profiling_flag, uint64_t aicore_l2_swimlane_ring_addrs, uint64_t aicore_pmu_ring_addrs ) { pthread_once(&g_tls_once, create_tls_keys); @@ -130,11 +130,11 @@ extern "C" void aicore_execute_wrapper( // Publish per-core profiling state before the executor runs. 
set_aicore_profiling_flag(enable_profiling_flag); - if ((enable_profiling_flag & PROFILING_FLAG_L2_SWIMLANE) && aicore_l2_perf_ring_addrs != 0) { - uint64_t *ring_table = reinterpret_cast(aicore_l2_perf_ring_addrs); - set_aicore_l2_perf_ring(reinterpret_cast<__gm__ L2PerfAicoreRing *>(ring_table[block_idx])); + if ((enable_profiling_flag & PROFILING_FLAG_L2_SWIMLANE) && aicore_l2_swimlane_ring_addrs != 0) { + uint64_t *ring_table = reinterpret_cast(aicore_l2_swimlane_ring_addrs); + set_aicore_l2_swimlane_ring(reinterpret_cast<__gm__ L2SwimlaneAicoreRing *>(ring_table[block_idx])); } else { - set_aicore_l2_perf_ring(nullptr); + set_aicore_l2_swimlane_ring(nullptr); } if ((enable_profiling_flag & PROFILING_FLAG_PMU) && aicore_pmu_ring_addrs != 0) { uint64_t *pmu_ring_table = reinterpret_cast(aicore_pmu_ring_addrs); diff --git a/src/a5/platform/sim/host/CMakeLists.txt b/src/a5/platform/sim/host/CMakeLists.txt index 88b9cd32f..217d94a6a 100644 --- a/src/a5/platform/sim/host/CMakeLists.txt +++ b/src/a5/platform/sim/host/CMakeLists.txt @@ -44,7 +44,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/profiling_copy.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/scope_stats_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 85b79fbdf..1cf0bcef8 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -47,7 +47,7 @@ typedef int (*aicpu_execute_func_t)(Runtime *runtime); typedef void (*aicore_execute_func_t)( Runtime *runtime, int block_idx, CoreType core_type, 
uint32_t physical_core_id, uint64_t regs, - uint32_t enable_profiling_flag, uint64_t aicore_l2_perf_ring_addrs, uint64_t aicore_pmu_ring_addrs + uint32_t enable_profiling_flag, uint64_t aicore_l2_swimlane_ring_addrs, uint64_t aicore_pmu_ring_addrs ); typedef void (*set_platform_regs_func_t)(uint64_t regs); @@ -253,10 +253,10 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - set_platform_l2_perf_base_func_ = - reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_l2_perf_base")); - if (set_platform_l2_perf_base_func_ == nullptr) { - LOG_ERROR("dlsym failed for set_platform_l2_perf_base: %s", dlerror()); + set_platform_l2_swimlane_base_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_l2_swimlane_base")); + if (set_platform_l2_swimlane_base_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_l2_swimlane_base: %s", dlerror()); return -1; } @@ -478,9 +478,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // Initialize per-subsystem shared memory. 
if (enable_l2_swimlane_) { - rc = init_l2_perf(num_aicore, device_id_); + rc = init_l2_swimlane(num_aicore, device_id_); if (rc != 0) { - LOG_ERROR("init_l2_perf failed: %d", rc); + LOG_ERROR("init_l2_swimlane failed: %d", rc); return rc; } } @@ -567,7 +567,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { set_platform_regs_func_(kernel_args_.regs); set_platform_dump_base_func_(kernel_args_.dump_data_base); set_dump_tensor_enabled_func_(enable_dump_tensor_); - set_platform_l2_perf_base_func_(kernel_args_.l2_perf_data_base); + set_platform_l2_swimlane_base_func_(kernel_args_.l2_swimlane_data_base); set_l2_swimlane_enabled_func_(enable_l2_swimlane_); set_platform_pmu_base_func_(kernel_args_.pmu_data_base); set_pmu_enabled_func_(enable_pmu_); @@ -586,7 +586,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return create_thread(std::move(fn)); }; if (enable_l2_swimlane_) { - l2_perf_collector_.start(thread_factory); + l2_swimlane_collector_.start(thread_factory); } if (enable_dump_tensor_) { dump_collector_.start(thread_factory); @@ -640,7 +640,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { aicore_threads.push_back(create_thread([this, &runtime, i, core_type, physical_core_id]() { aicore_execute_func_( &runtime, i, core_type, physical_core_id, kernel_args_.regs, kernel_args_.enable_profiling_flag, - kernel_args_.aicore_l2_perf_ring_addrs, kernel_args_.aicore_pmu_ring_addrs + kernel_args_.aicore_l2_swimlane_ring_addrs, kernel_args_.aicore_pmu_ring_addrs ); })); } @@ -674,10 +674,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // directory the user set on CallConfig (validate() enforces non-empty // upstream). 
if (enable_l2_swimlane_) { - l2_perf_collector_.stop(); - l2_perf_collector_.read_phase_header_metadata(); - l2_perf_collector_.reconcile_counters(); - l2_perf_collector_.export_swimlane_json(); + l2_swimlane_collector_.stop(); + l2_swimlane_collector_.read_phase_header_metadata(); + l2_swimlane_collector_.reconcile_counters(); + l2_swimlane_collector_.export_swimlane_json(); } if (enable_dump_tensor_) { @@ -739,7 +739,7 @@ void DeviceRunner::unload_executor_binaries() { set_platform_regs_func_ = nullptr; set_platform_dump_base_func_ = nullptr; set_dump_tensor_enabled_func_ = nullptr; - set_platform_l2_perf_base_func_ = nullptr; + set_platform_l2_swimlane_base_func_ = nullptr; set_l2_swimlane_enabled_func_ = nullptr; set_platform_pmu_base_func_ = nullptr; set_pmu_enabled_func_ = nullptr; @@ -939,8 +939,8 @@ int DeviceRunner::finalize() { } // Cleanup all profiling subsystems. - if (l2_perf_collector_.is_initialized()) { - l2_perf_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); + if (l2_swimlane_collector_.is_initialized()) { + l2_swimlane_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); } if (dump_collector_.is_initialized()) { dump_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); @@ -1119,8 +1119,8 @@ uint64_t DeviceRunner::upload_chip_callable_buffer(const ChipCallable *callable) // ============================================================================= void DeviceRunner::finalize_collectors() { - if (l2_perf_collector_.is_initialized()) { - l2_perf_collector_.stop(); + if (l2_swimlane_collector_.is_initialized()) { + l2_swimlane_collector_.stop(); } if (dump_collector_.is_initialized()) { dump_collector_.stop(); @@ -1130,14 +1130,15 @@ void DeviceRunner::finalize_collectors() { } } -int DeviceRunner::init_l2_perf(int num_aicore, int device_id) { - int rc = l2_perf_collector_.initialize( - num_aicore, device_id, l2_perf_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_ +int 
DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) { + int rc = l2_swimlane_collector_.initialize( + num_aicore, device_id, l2_swimlane_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_ ); if (rc == 0) { - kernel_args_.l2_perf_data_base = reinterpret_cast(l2_perf_collector_.get_l2_perf_setup_device_ptr()); - kernel_args_.aicore_l2_perf_ring_addrs = - reinterpret_cast(l2_perf_collector_.get_aicore_ring_addrs_device_ptr()); + kernel_args_.l2_swimlane_data_base = + reinterpret_cast(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr()); + kernel_args_.aicore_l2_swimlane_ring_addrs = + reinterpret_cast(l2_swimlane_collector_.get_aicore_ring_addrs_device_ptr()); } return rc; } diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 57caec5e7..2899d35a2 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -47,12 +47,12 @@ #include "common/core_type.h" #include "common/kernel_args.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "host/function_cache.h" #include "host/memory_allocator.h" -#include "host/l2_perf_collector.h" +#include "host/l2_swimlane_collector.h" #include "host/pmu_collector.h" #include "host/scope_stats_collector.h" #include "host/tensor_dump_collector.h" @@ -184,8 +184,8 @@ class DeviceRunner { * Runtime struct / run() arg list so all three travel the same way. 
*/ void set_l2_swimlane_enabled(int level) { - l2_perf_level_ = static_cast(level); - enable_l2_swimlane_ = (l2_perf_level_ != L2PerfLevel::DISABLED); + l2_swimlane_level_ = static_cast(level); + enable_l2_swimlane_ = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED); } void set_dump_tensor_enabled(bool enable) { enable_dump_tensor_ = enable; } void set_pmu_enabled(int enable_pmu) { @@ -193,7 +193,7 @@ class DeviceRunner { pmu_event_type_ = resolve_pmu_event_type(enable_pmu); } void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; } - // Directory under which all diagnostic artifacts (l2_perf_records.json / + // Directory under which all diagnostic artifacts (l2_swimlane_records.json / // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic // is enabled; CallConfig::validate() enforces this contract upstream. void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; } @@ -377,7 +377,7 @@ class DeviceRunner { void (*set_platform_dump_base_func_)(uint64_t){nullptr}; void (*set_platform_pmu_base_func_)(uint64_t){nullptr}; void (*set_dump_tensor_enabled_func_)(bool){nullptr}; - void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr}; + void (*set_platform_l2_swimlane_base_func_)(uint64_t){nullptr}; void (*set_l2_swimlane_enabled_func_)(bool){nullptr}; void (*set_pmu_enabled_func_)(bool){nullptr}; void (*set_scope_stats_enabled_func_)(bool){nullptr}; @@ -386,7 +386,7 @@ class DeviceRunner { std::string aicore_so_path_; // Performance profiling - L2PerfCollector l2_perf_collector_; + L2SwimlaneCollector l2_swimlane_collector_; // Tensor dump (independent from profiling) TensorDumpCollector dump_collector_; @@ -417,7 +417,7 @@ class DeviceRunner { * @param device_id Device ID (ignored in simulation) * @return 0 on success, error code on failure */ - int init_l2_perf(int num_aicore, int device_id); + int init_l2_swimlane(int num_aicore, int device_id); /** * Initialize tensor dump for 
simulation. @@ -439,9 +439,9 @@ class DeviceRunner { bool enable_dump_tensor_{false}; bool enable_pmu_{false}; bool enable_scope_stats_{false}; - L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() - PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() - std::string output_prefix_{}; // diagnostic artifact root directory + L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() + PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() + std::string output_prefix_{}; // diagnostic artifact root directory int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id); int init_scope_stats(int num_threads); diff --git a/src/a5/platform/src/aicpu/l2_perf_collector_aicpu.cpp b/src/a5/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp similarity index 64% rename from src/a5/platform/src/aicpu/l2_perf_collector_aicpu.cpp rename to src/a5/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp index fdf16986c..877383642 100644 --- a/src/a5/platform/src/aicpu/l2_perf_collector_aicpu.cpp +++ b/src/a5/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp @@ -10,15 +10,15 @@ */ /** - * @file l2_perf_collector_aicpu.cpp + * @file l2_swimlane_collector_aicpu.cpp * @brief AICPU performance data collection implementation (SPSC free queue) * - * Uses per-core L2PerfBufferState with SPSC free queues for O(1) buffer switching. + * Uses per-core L2SwimlaneAicpuTaskPool with SPSC free queues for O(1) buffer switching. * Host memory manager dynamically allocates replacement buffers and pushes * them into the free_queue. Device pops from free_queue when switching. 
*/ -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include #include @@ -29,58 +29,60 @@ #include "common/unified_log.h" // Cached pointers for hot-path access (set during init) -static AicpuPhaseHeader *s_phase_header = nullptr; -static L2PerfDataHeader *s_l2_perf_header = nullptr; +static L2SwimlaneAicpuPhaseHeader *s_l2_swimlane_aicpu_phase_header = nullptr; +static L2SwimlaneDataHeader *s_l2_swimlane_header = nullptr; -// Per-core L2PerfBufferState cache -static L2PerfBufferState *s_perf_buffer_states[PLATFORM_MAX_CORES] = {}; +// Per-core L2SwimlaneAicpuTaskPool cache +static L2SwimlaneAicpuTaskPool *s_aicpu_task_pools[PLATFORM_MAX_CORES] = {}; -// Per-core L2PerfAicoreRing cache (stable for the run; AICPU reads, AICore writes) -static L2PerfAicoreRing *s_perf_aicore_rings[PLATFORM_MAX_CORES] = {}; +// Per-core L2SwimlaneAicoreRing cache (stable for the run; AICPU reads, AICore writes) +static L2SwimlaneAicoreRing *s_perf_aicore_rings[PLATFORM_MAX_CORES] = {}; // Per-core cached current-records-buffer pointer. Written by AICPU when // rotating buffers from inside `complete_record`; AICPU never publishes this // to AICore (AICore only sees the stable ring). -static L2PerfBuffer *s_perf_records_buffers[PLATFORM_MAX_CORES] = {}; +static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORES] = {}; -// Per-thread PhaseBufferState cache -static PhaseBufferState *s_phase_buffer_states[PLATFORM_MAX_AICPU_THREADS] = {}; -static PhaseBuffer *s_current_phase_buf[PLATFORM_MAX_AICPU_THREADS] = {}; +// Per-thread L2SwimlaneAicpuPhasePool cache +static L2SwimlaneAicpuPhasePool *s_aicpu_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {}; +static L2SwimlaneAicpuPhaseBuffer *s_current_aicpu_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; static int s_orch_thread_idx = -1; -// L2 perf platform state. Published by the host (via dlsym'd setters on sim) +// L2 swimlane platform state. 
Published by the host (via dlsym'd setters on sim) // or by the AICPU kernel entry (onboard) before perf init runs, so downstream // perf code can discover enablement + device-base without reading the generic // Runtime struct. Two channels (mirrors PMU): // - g_enable_l2_swimlane (bool) — set at kernel entry from the bitmask bit -// - g_l2_perf_level (L2PerfLevel) — promoted in l2_perf_aicpu_init from the +// - g_l2_swimlane_level (L2SwimlaneLevel) — promoted in l2_swimlane_aicpu_init from the // shared-memory header so `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` -// gates have the granular value (exposed via get_l2_perf_level()). -static uint64_t g_platform_l2_perf_base = 0; +// gates have the granular value (exposed via get_l2_swimlane_level()). +static uint64_t g_platform_l2_swimlane_base = 0; static bool g_enable_l2_swimlane = false; -static L2PerfLevel g_l2_perf_level = L2PerfLevel::DISABLED; +static L2SwimlaneLevel g_l2_swimlane_level = L2SwimlaneLevel::DISABLED; -extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base) { g_platform_l2_perf_base = l2_perf_data_base; } -extern "C" uint64_t get_platform_l2_perf_base() { return g_platform_l2_perf_base; } +extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base) { + g_platform_l2_swimlane_base = l2_swimlane_data_base; +} +extern "C" uint64_t get_platform_l2_swimlane_base() { return g_platform_l2_swimlane_base; } extern "C" void set_l2_swimlane_enabled(bool enable) { g_enable_l2_swimlane = enable; } extern "C" bool is_l2_swimlane_enabled() { return g_enable_l2_swimlane; } -L2PerfLevel get_l2_perf_level() { return g_l2_perf_level; } +L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; } /** * Enqueue ready buffer to per-thread queue * - * @param header L2PerfDataHeader pointer + * @param header L2SwimlaneDataHeader pointer * @param thread_idx Thread index * @param core_index Core index (or thread_idx for phase entries) * @param buffer_ptr Device pointer to the 
full buffer * @param buffer_seq Sequence number for ordering - * @param is_phase 0 = L2PerfRecord, 1 = Phase + * @param kind Buffer kind discriminator (see L2SwimlaneBufferKind) * @return 0 on success, -1 if queue full */ static int enqueue_ready_buffer( - L2PerfDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq, - uint32_t is_phase + L2SwimlaneDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq, + L2SwimlaneBufferKind kind ) { uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE; uint32_t current_tail = header->queue_tails[thread_idx]; @@ -93,7 +95,7 @@ static int enqueue_ready_buffer( } header->queues[thread_idx][current_tail].core_index = core_index; - header->queues[thread_idx][current_tail].is_phase = is_phase; + header->queues[thread_idx][current_tail].kind = kind; header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; header->queue_tails[thread_idx] = next_tail; @@ -101,33 +103,33 @@ static int enqueue_ready_buffer( return 0; } -void l2_perf_aicpu_init(int worker_count) { - void *l2_perf_base = reinterpret_cast(g_platform_l2_perf_base); - if (l2_perf_base == nullptr) { - LOG_ERROR("l2_perf_data_base is NULL, cannot initialize profiling"); +void l2_swimlane_aicpu_init(int worker_count) { + void *l2_swimlane_base = reinterpret_cast(g_platform_l2_swimlane_base); + if (l2_swimlane_base == nullptr) { + LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize profiling"); return; } - s_l2_perf_header = get_l2_perf_header(l2_perf_base); + s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base); // Read the granular perf_level from the shared-memory header (host wrote - // it in L2PerfCollector::initialize). The kernel-entry setter only seeded + // it in L2SwimlaneCollector::initialize). The kernel-entry setter only seeded // the binary g_enable_l2_swimlane via the bitmask bit. 
- g_l2_perf_level = static_cast(s_l2_perf_header->l2_perf_level); + g_l2_swimlane_level = static_cast(s_l2_swimlane_header->l2_swimlane_level); LOG_INFO_V0( - "Initializing performance profiling for %d cores (memcpy-based), l2_perf_level=%u", worker_count, - static_cast(g_l2_perf_level) + "Initializing performance profiling for %d cores (memcpy-based), l2_swimlane_level=%u", worker_count, + static_cast(g_l2_swimlane_level) ); // Pop first buffer from free_queue for each core, and cache the stable // AICore staging ring pointer so complete_record can read it without // touching SHM. for (int i = 0; i < worker_count; i++) { - L2PerfBufferState *state = get_perf_buffer_state(l2_perf_base, i); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(l2_swimlane_base, i); - s_perf_buffer_states[i] = state; - s_perf_aicore_rings[i] = reinterpret_cast(state->aicore_ring_ptr); + s_aicpu_task_pools[i] = state; + s_perf_aicore_rings[i] = reinterpret_cast(state->aicore_ring_ptr); // Pop first buffer from free_queue rmb(); @@ -142,15 +144,15 @@ void l2_perf_aicpu_init(int worker_count) { state->current_buf_seq = 0; wmb(); - L2PerfBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast(buf_ptr); buf->count = 0; - s_perf_records_buffers[i] = buf; + s_current_aicpu_task_buffers[i] = buf; LOG_DEBUG("Core %d: popped initial buffer (addr=0x%lx)", i, buf_ptr); } else { LOG_ERROR("Core %d: free_queue is empty during init!", i); state->current_buf_ptr = 0; - s_perf_records_buffers[i] = nullptr; + s_current_aicpu_task_buffers[i] = nullptr; } } @@ -160,18 +162,18 @@ void l2_perf_aicpu_init(int worker_count) { } /** - * Internal records-buffer rotation. Called from `l2_perf_aicpu_complete_record` + * Internal records-buffer rotation. Called from `l2_swimlane_aicpu_complete_task` * after a record is committed and the buffer hits capacity. Only swaps an * AICPU-private records pointer — AICore reads from a stable ring and is * unaffected by this call. 
*/ static void switch_records_buffer(int core_id, int thread_idx) { - L2PerfBufferState *state = s_perf_buffer_states[core_id]; + L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id]; if (state == nullptr) { return; } - L2PerfBuffer *full_buf = s_perf_records_buffers[core_id]; + L2SwimlaneAicpuTaskBuffer *full_buf = s_current_aicpu_task_buffers[core_id]; if (full_buf == nullptr) { return; } @@ -194,7 +196,9 @@ static void switch_records_buffer(int core_id, int thread_idx) { // Enqueue full buffer to ReadyQueue uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, state->current_buf_ptr, seq, 0); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, core_id, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask + ); if (rc != 0) { LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id); // Revert: discard data and keep writing @@ -212,25 +216,25 @@ static void switch_records_buffer(int core_id, int thread_idx) { state->current_buf_seq = seq + 1; wmb(); - L2PerfBuffer *new_buf = reinterpret_cast(new_buf_ptr); + L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast(new_buf_ptr); new_buf->count = 0; - s_perf_records_buffers[core_id] = new_buf; + s_current_aicpu_task_buffers[core_id] = new_buf; LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr); } -int l2_perf_aicpu_complete_record( +int l2_swimlane_aicpu_complete_task( int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type, uint64_t dispatch_time, uint64_t finish_time, const uint64_t *fanout, int32_t fanout_count ) { if (core_id < 0 || core_id >= PLATFORM_MAX_CORES) { return -1; } - L2PerfBufferState *state = s_perf_buffer_states[core_id]; + L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id]; if (state == nullptr) { return -1; } - L2PerfAicoreRing *ring = 
s_perf_aicore_rings[core_id]; + L2SwimlaneAicoreRing *ring = s_perf_aicore_rings[core_id]; if (ring == nullptr) { return -1; } @@ -239,14 +243,14 @@ int l2_perf_aicpu_complete_record( // `device_total - (collected + dropped + mismatch)`. state->total_record_count += 1; - L2PerfBuffer *l2_perf_buf = s_perf_records_buffers[core_id]; - if (l2_perf_buf == nullptr) { + L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id]; + if (l2_swimlane_buf == nullptr) { // No active records buffer (init ran out of free buffers); count as drop // so host reconciliation stays consistent. state->dropped_record_count += 1; return -1; } - uint32_t count = l2_perf_buf->count; + uint32_t count = l2_swimlane_buf->count; if (count >= PLATFORM_PROF_BUFFER_SIZE) { // Defensive: should not happen because we rotate at end of every commit. state->dropped_record_count += 1; @@ -254,8 +258,8 @@ int l2_perf_aicpu_complete_record( } // Read AICore-published timing from the per-core staging ring. - L2PerfRecord *slot = &ring->dual_issue_slots[expected_reg_task_id % PLATFORM_L2_AICORE_RING_SIZE]; - // One PoC cache line: matches AICore l2_perf_aicore_record_task() dcci(..., SINGLE_CACHE_LINE, ...) + L2SwimlaneAicpuTaskRecord *slot = &ring->dual_issue_slots[expected_reg_task_id % PLATFORM_L2_AICORE_RING_SIZE]; + // One PoC cache line: matches AICore l2_swimlane_aicore_record_task() dcci(..., SINGLE_CACHE_LINE, ...) // and aicpu/cache_ops.cpp step size; timing fields live in the first line. cache_invalidate_range(slot, 64); if (static_cast(slot->task_id) != expected_reg_task_id) { @@ -266,7 +270,7 @@ int l2_perf_aicpu_complete_record( // dcci before signaling). Surface separately from capacity drops. 
state->mismatch_record_count += 1; LOG_ERROR( - "L2Perf invariant violated: core %d slot task_id=0x%x expected=0x%x " + "L2Swimlane invariant violated: core %d slot task_id=0x%x expected=0x%x " "(completion-before-dispatch broken or ring undersized)", core_id, static_cast(slot->task_id), expected_reg_task_id ); @@ -274,7 +278,7 @@ int l2_perf_aicpu_complete_record( } // Copy AICore timing to committed record slot - L2PerfRecord *record = &l2_perf_buf->records[count]; + L2SwimlaneAicpuTaskRecord *record = &l2_swimlane_buf->records[count]; record->start_time = slot->start_time; record->end_time = slot->end_time; @@ -284,7 +288,7 @@ int l2_perf_aicpu_complete_record( record->core_type = core_type; // AICPU_TIMING and above: dispatch/finish timing and fanout dependency info - if (g_l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (g_l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { record->dispatch_time = dispatch_time; record->finish_time = finish_time; if (fanout != nullptr && fanout_count > 0) { @@ -303,7 +307,7 @@ int l2_perf_aicpu_complete_record( } uint32_t new_count = count + 1; - l2_perf_buf->count = new_count; + l2_swimlane_buf->count = new_count; wmb(); // Rotate after the write so the just-committed record is preserved. 
@@ -315,7 +319,7 @@ int l2_perf_aicpu_complete_record( return 0; } -void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num) { +void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num) { if (!is_l2_swimlane_enabled()) { return; } @@ -328,7 +332,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - L2PerfBufferState *state = s_perf_buffer_states[core_id]; + L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id]; if (state == nullptr) continue; rmb(); @@ -338,18 +342,20 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in continue; } - L2PerfBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast(buf_ptr); if (buf->count == 0) { continue; } uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, buf_ptr, seq, 0); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, core_id, buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask + ); if (rc == 0) { LOG_INFO_V0("Thread %d: Core %d flushed buffer with %u records", thread_idx, core_id, buf->count); flushed_count++; state->current_buf_ptr = 0; - s_perf_records_buffers[core_id] = nullptr; + s_current_aicpu_task_buffers[core_id] = nullptr; wmb(); } else { // ready_queue full at end-of-run: account the loss and clear the @@ -362,7 +368,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in state->dropped_record_count += buf->count; buf->count = 0; state->current_buf_ptr = 0; - s_perf_records_buffers[core_id] = nullptr; + s_current_aicpu_task_buffers[core_id] = nullptr; wmb(); } } @@ -372,22 +378,24 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in LOG_INFO_V0("Thread %d: Performance buffer flush complete, %d buffers flushed", thread_idx, flushed_count); } 
-void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { - void *l2_perf_base = reinterpret_cast(g_platform_l2_perf_base); - if (l2_perf_base == nullptr) { - LOG_ERROR("l2_perf_data_base is NULL, cannot initialize phase profiling"); +void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads) { + void *l2_swimlane_base = reinterpret_cast(g_platform_l2_swimlane_base); + if (l2_swimlane_base == nullptr) { + LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize phase profiling"); return; } - s_phase_header = get_phase_header(l2_perf_base, worker_count); - s_l2_perf_header = get_l2_perf_header(l2_perf_base); + s_l2_swimlane_aicpu_phase_header = get_phase_header(l2_swimlane_base, worker_count); + s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base); - s_phase_header->magic = AICPU_PHASE_MAGIC; - s_phase_header->num_sched_threads = num_sched_threads; - s_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD; - s_phase_header->num_cores = 0; + s_l2_swimlane_aicpu_phase_header->magic = L2_SWIMLANE_AICPU_PHASE_MAGIC; + s_l2_swimlane_aicpu_phase_header->num_sched_threads = num_sched_threads; + s_l2_swimlane_aicpu_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD; + s_l2_swimlane_aicpu_phase_header->num_cores = 0; - memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread)); + memset( + s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread) + ); // Cache per-thread record pointers and clear buffers // Include all threads: scheduler + orchestrator (orchestrators may become schedulers) @@ -396,9 +404,9 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { total_threads = PLATFORM_MAX_AICPU_THREADS; } for (int t = 0; t < total_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(l2_perf_base, worker_count, t); + L2SwimlaneAicpuPhasePool *state = 
get_phase_buffer_state(l2_swimlane_base, worker_count, t); - s_phase_buffer_states[t] = state; + s_aicpu_phase_pools[t] = state; // Pop first buffer from free_queue rmb(); @@ -413,22 +421,22 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { state->current_buf_seq = 0; wmb(); - PhaseBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast(buf_ptr); buf->count = 0; - s_current_phase_buf[t] = buf; + s_current_aicpu_phase_buffers[t] = buf; LOG_DEBUG("Thread %d: popped initial phase buffer (addr=0x%lx)", t, buf_ptr); } else { LOG_ERROR("Thread %d: phase free_queue is empty during init!", t); state->current_buf_ptr = 0; - s_current_phase_buf[t] = nullptr; + s_current_aicpu_phase_buffers[t] = nullptr; } } // Clear remaining slots for (int t = total_threads; t < PLATFORM_MAX_AICPU_THREADS; t++) { - s_phase_buffer_states[t] = nullptr; - s_current_phase_buf[t] = nullptr; + s_aicpu_phase_pools[t] = nullptr; + s_current_aicpu_phase_buffers[t] = nullptr; } wmb(); @@ -443,21 +451,23 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) { * Switch phase buffer when current buffer is full (free queue version) * * Enqueues the full buffer to ReadyQueue and pops the next buffer from free_queue. - * If no free buffer is available, sets s_current_phase_buf to nullptr so subsequent + * If no free buffer is available, sets s_current_aicpu_phase_buffers to nullptr so subsequent * records are dropped (preserving already-enqueued data). 
*/ static void switch_phase_buffer(int thread_idx) { - PhaseBufferState *state = s_phase_buffer_states[thread_idx]; + L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx]; if (state == nullptr) return; - PhaseBuffer *full_buf = s_current_phase_buf[thread_idx]; + L2SwimlaneAicpuPhaseBuffer *full_buf = s_current_aicpu_phase_buffers[thread_idx]; if (full_buf == nullptr) return; LOG_INFO_V0("Thread %d: phase buffer is full (count=%u)", thread_idx, full_buf->count); // Enqueue to ReadyQueue uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, state->current_buf_ptr, seq, 1); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, thread_idx, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase + ); if (rc != 0) { LOG_ERROR("Thread %d: failed to enqueue phase buffer (queue full), discarding data", thread_idx); // Treat the entire un-enqueued buffer as dropped to keep the @@ -482,29 +492,29 @@ static void switch_phase_buffer(int thread_idx) { state->current_buf_seq = seq + 1; wmb(); - PhaseBuffer *new_buf = reinterpret_cast(new_buf_ptr); + L2SwimlaneAicpuPhaseBuffer *new_buf = reinterpret_cast(new_buf_ptr); new_buf->count = 0; - s_current_phase_buf[thread_idx] = new_buf; + s_current_aicpu_phase_buffers[thread_idx] = new_buf; LOG_INFO_V0("Thread %d: switched to new phase buffer", thread_idx); } else { // No free buffer available, drop subsequent records LOG_WARN("Thread %d: no free phase buffer available, dropping records until Host catches up", thread_idx); - s_current_phase_buf[thread_idx] = nullptr; + s_current_aicpu_phase_buffers[thread_idx] = nullptr; state->current_buf_ptr = 0; wmb(); } } -void l2_perf_aicpu_record_phase( - int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, +void l2_swimlane_aicpu_record_phase( + int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, 
uint64_t tasks_processed, uint32_t extra1, uint32_t extra2 ) { - if (s_phase_header == nullptr) { + if (s_l2_swimlane_aicpu_phase_header == nullptr) { return; } - PhaseBufferState *state = s_phase_buffer_states[thread_idx]; + L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx]; if (state == nullptr) { return; } @@ -513,7 +523,7 @@ void l2_perf_aicpu_record_phase( // as `device_total - (collected + dropped)` (mirrors PERF accounting). state->total_record_count += 1; - PhaseBuffer *buf = s_current_phase_buf[thread_idx]; + L2SwimlaneAicpuPhaseBuffer *buf = s_current_aicpu_phase_buffers[thread_idx]; // Try to recover from nullptr (no buffer was available on previous switch) if (buf == nullptr) { @@ -529,9 +539,9 @@ void l2_perf_aicpu_record_phase( state->current_buf_seq = state->current_buf_seq + 1; wmb(); - buf = reinterpret_cast(buf_ptr); + buf = reinterpret_cast(buf_ptr); buf->count = 0; - s_current_phase_buf[thread_idx] = buf; + s_current_aicpu_phase_buffers[thread_idx] = buf; LOG_INFO_V0("Thread %d: recovered phase buffer", thread_idx); } @@ -546,7 +556,7 @@ void l2_perf_aicpu_record_phase( if (idx >= PLATFORM_PHASE_RECORDS_PER_THREAD) { // Buffer full, switch to next buffer switch_phase_buffer(thread_idx); - buf = s_current_phase_buf[thread_idx]; + buf = s_current_aicpu_phase_buffers[thread_idx]; if (buf == nullptr) { state->dropped_record_count += 1; return; @@ -558,7 +568,7 @@ void l2_perf_aicpu_record_phase( } } - AicpuPhaseRecord *record = &buf->records[idx]; + L2SwimlaneAicpuPhaseRecord *record = &buf->records[idx]; record->start_time = start_time; record->end_time = end_time; record->loop_iter = loop_iter; @@ -570,21 +580,21 @@ void l2_perf_aicpu_record_phase( buf->count = idx + 1; } -void l2_perf_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thread_idx; } +void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thread_idx; } -void l2_perf_aicpu_record_orch_phase( - AicpuPhaseId phase_id, uint64_t 
start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id +void l2_swimlane_aicpu_record_orch_phase( + L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id ) { - if (s_orch_thread_idx < 0 || s_phase_header == nullptr) return; - l2_perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id); + if (s_orch_thread_idx < 0 || s_l2_swimlane_aicpu_phase_header == nullptr) return; + l2_swimlane_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id); } -void l2_perf_aicpu_flush_phase_buffers(int thread_idx) { - if (s_phase_header == nullptr || s_l2_perf_header == nullptr) { +void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) { + if (s_l2_swimlane_aicpu_phase_header == nullptr || s_l2_swimlane_header == nullptr) { return; } - PhaseBufferState *state = s_phase_buffer_states[thread_idx]; + L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx]; if (state == nullptr) return; rmb(); @@ -594,13 +604,15 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) { return; } - PhaseBuffer *buf = reinterpret_cast(buf_ptr); + L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast(buf_ptr); if (buf->count == 0) { return; } uint32_t seq = state->current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, buf_ptr, seq, 1); + int rc = enqueue_ready_buffer( + s_l2_swimlane_header, thread_idx, thread_idx, buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase + ); if (rc == 0) { LOG_INFO_V0("Thread %d: flushed phase buffer with %u records", thread_idx, buf->count); } else { @@ -609,28 +621,30 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) { buf->count = 0; } state->current_buf_ptr = 0; - s_current_phase_buf[thread_idx] = nullptr; + s_current_aicpu_phase_buffers[thread_idx] = nullptr; wmb(); } -void l2_perf_aicpu_init_core_assignments(int total_cores) { - if (s_phase_header == nullptr) { +void 
l2_swimlane_aicpu_init_core_assignments(int total_cores) { + if (s_l2_swimlane_aicpu_phase_header == nullptr) { return; } - memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread)); - s_phase_header->num_cores = static_cast(total_cores); + memset( + s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread) + ); + s_l2_swimlane_aicpu_phase_header->num_cores = static_cast(total_cores); wmb(); LOG_INFO_V0("Core-to-thread mapping init: %d cores", total_cores); } -void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) { - if (s_phase_header == nullptr) { +void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) { + if (s_l2_swimlane_aicpu_phase_header == nullptr) { return; } for (int i = 0; i < core_num; i++) { int core_id = core_ids[i]; if (core_id >= 0 && core_id < PLATFORM_MAX_CORES) { - s_phase_header->core_to_thread[core_id] = static_cast(thread_idx); + s_l2_swimlane_aicpu_phase_header->core_to_thread[core_id] = static_cast(thread_idx); } } wmb(); diff --git a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp index 33c04f783..15eb4405a 100644 --- a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp +++ b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp @@ -13,7 +13,7 @@ * @file tensor_dump_aicpu.cpp * @brief AICPU tensor dump collection implementation * - * Mirrors l2_perf_collector_aicpu.cpp patterns: + * Mirrors l2_swimlane_collector_aicpu.cpp patterns: * - Per-thread DumpBufferState with SPSC free queues * - Per-thread ready queue for handing off full metadata buffers * - Per-thread circular arena for tensor payload data diff --git a/src/a5/platform/src/host/l2_perf_collector.cpp b/src/a5/platform/src/host/l2_swimlane_collector.cpp similarity index 80% rename from src/a5/platform/src/host/l2_perf_collector.cpp rename to 
src/a5/platform/src/host/l2_swimlane_collector.cpp index 028d374a6..e57c8ed8e 100644 --- a/src/a5/platform/src/host/l2_perf_collector.cpp +++ b/src/a5/platform/src/host/l2_swimlane_collector.cpp @@ -10,21 +10,21 @@ */ /** - * @file l2_perf_collector.cpp + * @file l2_swimlane_collector.cpp * @brief Performance data collector implementation. The mgmt-thread + * buffer-pool machinery lives in profiling_common::BufferPoolManager - * parameterized by L2PerfModule (host/l2_perf_collector.h); the + * parameterized by L2SwimlaneModule (host/l2_swimlane_collector.h); the * poll loop lives in profiling_common::ProfilerBase. This file * owns the per-buffer on_buffer_collected callback and the export * logic. * * a5 specifics: device↔host transfers go through profiling_copy.h. The * framework's mgmt loop mirrors the shm region per tick; per-buffer - * payloads (L2PerfBuffer / PhaseBuffer) are pulled on demand inside + * payloads (L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer) are pulled on demand inside * ProfilerAlgorithms. 
*/ -#include "host/l2_perf_collector.h" +#include "host/l2_swimlane_collector.h" #include #include @@ -43,7 +43,7 @@ #include "host/profiling_copy.h" // ============================================================================= -// L2PerfCollector Implementation +// L2SwimlaneCollector Implementation // ============================================================================= /** @@ -51,18 +51,18 @@ * Scheduler phases: SCHED_COMPLETE(0), SCHED_DISPATCH(1), SCHED_SCAN(2), SCHED_IDLE_WAIT(3) * Orchestrator phases: ORCH_SYNC(16) through ORCH_SCOPE_END(24) */ -static bool is_scheduler_phase(AicpuPhaseId id) { - return static_cast(id) < static_cast(AicpuPhaseId::SCHED_PHASE_COUNT); +static bool is_scheduler_phase(L2SwimlaneAicpuPhaseId id) { + return static_cast(id) < static_cast(L2SwimlaneAicpuPhaseId::SCHED_PHASE_COUNT); } -L2PerfCollector::~L2PerfCollector() { +L2SwimlaneCollector::~L2SwimlaneCollector() { stop(); if (shm_host_ != nullptr) { - LOG_WARN("L2PerfCollector destroyed without finalize()"); + LOG_WARN("L2SwimlaneCollector destroyed without finalize()"); } } -void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) { +void *L2SwimlaneCollector::alloc_single_buffer(size_t size, void **host_ptr_out) { void *dev_ptr = alloc_cb_(size); if (dev_ptr == nullptr) { LOG_ERROR("Failed to allocate buffer (%zu bytes)", size); @@ -98,12 +98,12 @@ void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) { return dev_ptr; } -int L2PerfCollector::initialize( - int num_aicore, int device_id, L2PerfLevel l2_perf_level, const L2PerfAllocCallback &alloc_cb, - L2PerfRegisterCallback register_cb, const L2PerfFreeCallback &free_cb, const std::string &output_prefix +int L2SwimlaneCollector::initialize( + int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb, + L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix ) { if 
(shm_host_ != nullptr) { - LOG_ERROR("L2PerfCollector already initialized"); + LOG_ERROR("L2SwimlaneCollector already initialized"); return -1; } @@ -115,7 +115,7 @@ int L2PerfCollector::initialize( } num_aicore_ = num_aicore; - l2_perf_level_ = l2_perf_level; + l2_swimlane_level_ = l2_swimlane_level; output_prefix_ = output_prefix; total_perf_collected_ = 0; total_phase_collected_ = 0; @@ -135,9 +135,9 @@ int L2PerfCollector::initialize( LOG_DEBUG("Shared memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); - LOG_DEBUG(" Header size: %zu bytes", sizeof(L2PerfDataHeader)); - LOG_DEBUG(" L2PerfBufferState size: %zu bytes each", sizeof(L2PerfBufferState)); - LOG_DEBUG(" PhaseBufferState size: %zu bytes each", sizeof(PhaseBufferState)); + LOG_DEBUG(" Header size: %zu bytes", sizeof(L2SwimlaneDataHeader)); + LOG_DEBUG(" L2SwimlaneAicpuTaskPool size: %zu bytes each", sizeof(L2SwimlaneAicpuTaskPool)); + LOG_DEBUG(" L2SwimlaneAicpuPhasePool size: %zu bytes each", sizeof(L2SwimlaneAicpuPhasePool)); LOG_DEBUG(" Total shared memory: %zu bytes (%zu KB)", total_size, total_size / 1024); // Step 2: Allocate shared memory + paired host shadow @@ -151,21 +151,21 @@ int L2PerfCollector::initialize( // Step 3: Initialize header on host shadow std::memset(perf_host_ptr, 0, total_size); - L2PerfDataHeader *header = get_l2_perf_header(perf_host_ptr); + L2SwimlaneDataHeader *header = get_l2_swimlane_header(perf_host_ptr); for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) { header->queue_heads[t] = 0; header->queue_tails[t] = 0; } header->num_cores = num_aicore; - header->l2_perf_level = static_cast(l2_perf_level_); + header->l2_swimlane_level = static_cast(l2_swimlane_level_); - LOG_DEBUG("Initialized L2PerfDataHeader:"); + LOG_DEBUG("Initialized L2SwimlaneDataHeader:"); LOG_DEBUG(" num_cores: %d", header->num_cores); - LOG_DEBUG(" l2_perf_level: %u", header->l2_perf_level); + LOG_DEBUG(" l2_swimlane_level: %u", header->l2_swimlane_level); LOG_DEBUG(" 
buffer_capacity: %d", PLATFORM_PROF_BUFFER_SIZE); LOG_DEBUG(" queue capacity: %d", PLATFORM_PROF_READYQUEUE_SIZE); - // Step 4: Allocate per-core stable L2PerfAicoreRings + the address-table + // Step 4: Allocate per-core stable L2SwimlaneAicoreRings + the address-table // buffer. Rings are allocated once and never rotated; AICore writes into // them at task time, AICPU reads at FIN time. The address-table mirrors // each ring's device pointer so the AICore-side `KernelArgs` machinery @@ -175,23 +175,23 @@ int L2PerfCollector::initialize( size_t table_size = static_cast(num_aicore) * sizeof(uint64_t); void *table_dev_ptr = alloc_single_buffer(table_size, &table_host_ptr); if (table_dev_ptr == nullptr) { - LOG_ERROR("Failed to allocate L2Perf aicore ring address table (%zu bytes)", table_size); + LOG_ERROR("Failed to allocate L2Swimlane aicore ring address table (%zu bytes)", table_size); return -1; } std::memset(table_host_ptr, 0, table_size); aicore_ring_addrs_dev_ = table_dev_ptr; aicore_ring_addrs_host_ = table_host_ptr; - // Step 4b: Initialize L2PerfBufferStates — 1 buffer/core in free_queue, rest to recycled pool. + // Step 4b: Initialize L2SwimlaneAicpuTaskPools — 1 buffer/core in free_queue, rest to recycled pool. for (int i = 0; i < num_aicore; i++) { - L2PerfBufferState *state = get_perf_buffer_state(perf_host_ptr, i); - std::memset(state, 0, sizeof(L2PerfBufferState)); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(perf_host_ptr, i); + std::memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool)); // Allocate the per-core staging ring (no host shadow needed: AICore // writes, AICPU reads — host never touches the ring directly). 
- void *ring_dev = alloc_cb(sizeof(L2PerfAicoreRing)); + void *ring_dev = alloc_cb(sizeof(L2SwimlaneAicoreRing)); if (ring_dev == nullptr) { - LOG_ERROR("Failed to allocate L2PerfAicoreRing for core %d", i); + LOG_ERROR("Failed to allocate L2SwimlaneAicoreRing for core %d", i); return -1; } aicore_rings_dev_[i] = ring_dev; @@ -200,22 +200,22 @@ int L2PerfCollector::initialize( for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_CORE; s++) { void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(L2PerfBuffer), &host_buf_ptr); + void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuTaskBuffer), &host_buf_ptr); if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate L2PerfBuffer for core %d, buffer %d", i, s); + LOG_ERROR("Failed to allocate L2SwimlaneAicpuTaskBuffer for core %d, buffer %d", i, s); return -1; } if (s == 0) { state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); } else { - manager_.push_recycled(static_cast(ProfBufferType::PERF_RECORD), dev_buf_ptr); + manager_.push_recycled(static_cast(ProfBufferType::AICPU_TASK), dev_buf_ptr); } } state->free_queue.tail = 1; } LOG_DEBUG( - "Initialized %d L2PerfBufferStates: 1 buffer/core, %d in recycled pool", num_aicore, + "Initialized %d L2SwimlaneAicpuTaskPools: 1 buffer/core, %d in recycled pool", num_aicore, num_aicore * (PLATFORM_PROF_BUFFERS_PER_CORE - 1) ); @@ -224,21 +224,21 @@ int L2PerfCollector::initialize( // Step 5: Initialize PhaseBufferStates — 1 buffer/thread in free_queue, rest to recycled pool. 
for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); - std::memset(state, 0, sizeof(PhaseBufferState)); + L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); + std::memset(state, 0, sizeof(L2SwimlaneAicpuPhasePool)); for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); + void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuPhaseBuffer), &host_buf_ptr); if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); + LOG_ERROR("Failed to allocate L2SwimlaneAicpuPhaseBuffer for thread %d, buffer %d", t, s); return -1; } if (s == 0) { state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); } else { - manager_.push_recycled(static_cast(ProfBufferType::PHASE), dev_buf_ptr); + manager_.push_recycled(static_cast(ProfBufferType::AICPU_PHASE), dev_buf_ptr); } } state->free_queue.tail = 1; @@ -259,7 +259,7 @@ int L2PerfCollector::initialize( collected_perf_records_.assign(num_aicore_, {}); collected_phase_records_.assign(PLATFORM_MAX_AICPU_THREADS, {}); - LOG_DEBUG("L2 perf device base = 0x%lx", reinterpret_cast(perf_dev_ptr)); + LOG_DEBUG("L2 swimlane device base = 0x%lx", reinterpret_cast(perf_dev_ptr)); LOG_INFO_V0("Performance profiling initialized (dynamic buffer mode)"); return 0; } @@ -268,8 +268,8 @@ int L2PerfCollector::initialize( // ProfilerBase callbacks // --------------------------------------------------------------------------- -void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) { - L2PerfBuffer *buf = reinterpret_cast(info.host_buffer_ptr); +void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) { + L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast(info.host_buffer_ptr); rmb(); uint32_t count = buf->count; if (count > 
PLATFORM_PROF_BUFFER_SIZE) { @@ -284,8 +284,8 @@ void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) { } } -void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) { - PhaseBuffer *buf = reinterpret_cast(info.host_buffer_ptr); +void L2SwimlaneCollector::copy_phase_buffer(const ReadyBufferInfo &info) { + L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast(info.host_buffer_ptr); rmb(); uint32_t count = buf->count; if (count > static_cast(PLATFORM_PHASE_RECORDS_PER_THREAD)) { @@ -303,8 +303,8 @@ void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) { } } -void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) { - if (info.type == ProfBufferType::PERF_RECORD) { +void L2SwimlaneCollector::on_buffer_collected(const ReadyBufferInfo &info) { + if (info.type == ProfBufferType::AICPU_TASK) { copy_perf_buffer(info); } else { copy_phase_buffer(info); @@ -320,13 +320,13 @@ void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) { // clear current_buf_ptr on the device side. Host's job here is purely // accounting + sanity check. // -// L2PerfBufferState now tracks total / dropped / mismatch counters — same +// L2SwimlaneAicpuTaskPool now tracks total / dropped / mismatch counters — same // three-bucket accounting as PMU and a2a3. The cross-check equation // (collected + dropped + mismatch == device_total) is enforced per pool // (PERF + PHASE). Empty PHASE pools (runtime emits no phase records) are // skipped via the `optional` flag. 
-void L2PerfCollector::reconcile_counters() { +void L2SwimlaneCollector::reconcile_counters() { if (shm_host_ == nullptr) return; // Pull the latest BufferStates (current_buf_ptr) before the per-unit @@ -337,7 +337,7 @@ void L2PerfCollector::reconcile_counters() { rmb(); // After stop(), AICPU's per-thread flush hooks - // (l2_perf_aicpu_flush_buffers / l2_perf_aicpu_flush_phase_buffers) + // (l2_swimlane_aicpu_flush / l2_swimlane_aicpu_flush_phase_buffers) // should have either enqueued the active buffer (success → // current_buf_ptr=0) or cleared it on enqueue failure. A non-zero // pointer with non-zero count means records AICPU neither delivered @@ -345,16 +345,16 @@ void L2PerfCollector::reconcile_counters() { // never written) are fine; AICPU's flush legitimately skips them. int leftover_active = 0; for (int i = 0; i < num_aicore_; i++) { - L2PerfBufferState *state = get_perf_buffer_state(shm_host_, i); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm_host_, i); uint64_t buf_ptr = state->current_buf_ptr; if (buf_ptr == 0) continue; void *host_ptr = manager_.resolve_host_ptr(reinterpret_cast(buf_ptr)); if (host_ptr == nullptr) continue; - profiling_copy_from_device(host_ptr, reinterpret_cast(buf_ptr), sizeof(L2PerfBuffer)); - uint32_t count = reinterpret_cast(host_ptr)->count; + profiling_copy_from_device(host_ptr, reinterpret_cast(buf_ptr), sizeof(L2SwimlaneAicpuTaskBuffer)); + uint32_t count = reinterpret_cast(host_ptr)->count; if (count == 0) continue; LOG_ERROR( - "L2Perf reconcile: core %d has un-flushed PERF buffer (current_buf_ptr=0x%lx, count=%u) " + "L2Swimlane reconcile: core %d has un-flushed PERF buffer (current_buf_ptr=0x%lx, count=%u) " "after stop() — device flush failed", i, static_cast(buf_ptr), count ); @@ -362,16 +362,16 @@ void L2PerfCollector::reconcile_counters() { } for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) { - PhaseBufferState *state = get_phase_buffer_state(shm_host_, num_aicore_, t); + L2SwimlaneAicpuPhasePool 
*state = get_phase_buffer_state(shm_host_, num_aicore_, t); uint64_t buf_ptr = state->current_buf_ptr; if (buf_ptr == 0) continue; void *host_ptr = manager_.resolve_host_ptr(reinterpret_cast(buf_ptr)); if (host_ptr == nullptr) continue; - profiling_copy_from_device(host_ptr, reinterpret_cast(buf_ptr), sizeof(PhaseBuffer)); - uint32_t count = reinterpret_cast(host_ptr)->count; + profiling_copy_from_device(host_ptr, reinterpret_cast(buf_ptr), sizeof(L2SwimlaneAicpuPhaseBuffer)); + uint32_t count = reinterpret_cast(host_ptr)->count; if (count == 0) continue; LOG_ERROR( - "L2Perf reconcile: thread %d has un-flushed PHASE buffer (current_buf_ptr=0x%lx, count=%u) " + "L2Swimlane reconcile: thread %d has un-flushed PHASE buffer (current_buf_ptr=0x%lx, count=%u) " "after stop() — device flush failed", t, static_cast(buf_ptr), count ); @@ -379,7 +379,9 @@ void L2PerfCollector::reconcile_counters() { } if (leftover_active > 0) { - LOG_ERROR("L2Perf reconcile: %d unit(s) had un-cleared current_buf_ptr — see prior errors", leftover_active); + LOG_ERROR( + "L2Swimlane reconcile: %d unit(s) had un-cleared current_buf_ptr — see prior errors", leftover_active + ); } // Cross-check device-side totals against host CSV. 
PERF and PHASE @@ -391,7 +393,7 @@ void L2PerfCollector::reconcile_counters() { uint64_t dropped_device = 0; uint64_t mismatch_device = 0; for (int i = 0; i < unit_count; i++) { - L2PerfBufferState *state = get_state(i); + L2SwimlaneAicpuTaskPool *state = get_state(i); total_device += state->total_record_count; dropped_device += state->dropped_record_count; mismatch_device += state->mismatch_record_count; @@ -403,14 +405,14 @@ void L2PerfCollector::reconcile_counters() { if (dropped_device > 0) { LOG_WARN( - "L2Perf reconcile: %lu %s records dropped on device side (buffer full / " + "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / " "ready_queue full / late FIN after flush).", static_cast(dropped_device), kind ); } if (mismatch_device > 0) { LOG_ERROR( - "L2Perf reconcile: %lu %s records lost to AICore staging-slot task_id mismatch — " + "L2Swimlane reconcile: %lu %s records lost to AICore staging-slot task_id mismatch — " "completion-before-dispatch invariant violated", static_cast(mismatch_device), kind ); @@ -418,7 +420,7 @@ void L2PerfCollector::reconcile_counters() { uint64_t accounted = collected + dropped_device + mismatch_device; if (accounted != total_device) { LOG_WARN( - "L2Perf reconcile: %s count mismatch (collected=%lu + dropped=%lu + mismatch=%lu != " + "L2Swimlane reconcile: %s count mismatch (collected=%lu + dropped=%lu + mismatch=%lu != " "device_total=%lu, silent_loss=%ld)", kind, static_cast(collected), static_cast(dropped_device), static_cast(mismatch_device), static_cast(total_device), @@ -426,8 +428,8 @@ void L2PerfCollector::reconcile_counters() { ); } else { LOG_INFO_V0( - "L2Perf reconcile: %s counts match (collected=%lu, dropped=%lu, mismatch=%lu, device_total=%lu)", kind, - static_cast(collected), static_cast(dropped_device), + "L2Swimlane reconcile: %s counts match (collected=%lu, dropped=%lu, mismatch=%lu, device_total=%lu)", + kind, static_cast(collected), static_cast(dropped_device), 
static_cast(mismatch_device), static_cast(total_device) ); } @@ -450,10 +452,10 @@ void L2PerfCollector::reconcile_counters() { ); } -void L2PerfCollector::read_phase_header_metadata() { +void L2SwimlaneCollector::read_phase_header_metadata() { if (shm_host_ == nullptr) return; - // Pull the AicpuPhaseHeader portion from device (the mgmt loop's final + // Pull the L2SwimlaneAicpuPhaseHeader portion from device (the mgmt loop's final // mirror covered it, but re-mirror to be safe in case stop() raced with // a final write of core_to_thread mapping). if (manager_.shared_mem_dev() != nullptr && shm_size_ > 0) { @@ -461,11 +463,12 @@ void L2PerfCollector::read_phase_header_metadata() { } rmb(); - AicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_); + L2SwimlaneAicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_); - if (phase_header->magic != AICPU_PHASE_MAGIC) { + if (phase_header->magic != L2_SWIMLANE_AICPU_PHASE_MAGIC) { LOG_INFO_V0( - "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic, AICPU_PHASE_MAGIC + "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic, + L2_SWIMLANE_AICPU_PHASE_MAGIC ); return; } @@ -511,7 +514,7 @@ void L2PerfCollector::read_phase_header_metadata() { // export_swimlane_json // --------------------------------------------------------------------------- -int L2PerfCollector::export_swimlane_json() { +int L2SwimlaneCollector::export_swimlane_json() { bool has_any_records = false; for (const auto &core_records : collected_perf_records_) { if (!core_records.empty()) { @@ -532,7 +535,7 @@ int L2PerfCollector::export_swimlane_json() { } struct TaggedRecord { - const L2PerfRecord *record; + const L2SwimlaneAicpuTaskRecord *record; uint32_t core_id; }; std::vector tagged_records; @@ -571,7 +574,7 @@ int L2PerfCollector::export_swimlane_json() { } } - std::string filepath = output_prefix_ + "/l2_perf_records.json"; + std::string filepath = 
output_prefix_ + "/l2_swimlane_records.json"; std::ofstream outfile(filepath); if (!outfile.is_open()) { @@ -579,9 +582,9 @@ int L2PerfCollector::export_swimlane_json() { return -1; } - int l2_perf_level = static_cast(l2_perf_level_); + int l2_swimlane_level = static_cast(l2_swimlane_level_); outfile << "{\n"; - outfile << " \"l2_perf_level\": " << l2_perf_level << ",\n"; + outfile << " \"l2_swimlane_level\": " << l2_swimlane_level << ",\n"; outfile << " \"tasks\": [\n"; for (size_t i = 0; i < tagged_records.size(); ++i) { @@ -627,41 +630,41 @@ int L2PerfCollector::export_swimlane_json() { outfile << " ]"; // Step: Write phase profiling data (level >= 3) - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - auto sched_phase_name = [](AicpuPhaseId id) -> const char * { + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + auto sched_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * { switch (id) { - case AicpuPhaseId::SCHED_COMPLETE: + case L2SwimlaneAicpuPhaseId::SCHED_COMPLETE: return "complete"; - case AicpuPhaseId::SCHED_DISPATCH: + case L2SwimlaneAicpuPhaseId::SCHED_DISPATCH: return "dispatch"; - case AicpuPhaseId::SCHED_SCAN: + case L2SwimlaneAicpuPhaseId::SCHED_SCAN: return "scan"; - case AicpuPhaseId::SCHED_IDLE_WAIT: + case L2SwimlaneAicpuPhaseId::SCHED_IDLE_WAIT: return "idle"; default: return "unknown"; } }; - auto orch_phase_name = [](AicpuPhaseId id) -> const char * { + auto orch_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * { switch (id) { - case AicpuPhaseId::ORCH_SYNC: + case L2SwimlaneAicpuPhaseId::ORCH_SYNC: return "orch_sync"; - case AicpuPhaseId::ORCH_ALLOC: + case L2SwimlaneAicpuPhaseId::ORCH_ALLOC: return "orch_alloc"; - case AicpuPhaseId::ORCH_PARAMS: + case L2SwimlaneAicpuPhaseId::ORCH_PARAMS: return "orch_params"; - case AicpuPhaseId::ORCH_LOOKUP: + case L2SwimlaneAicpuPhaseId::ORCH_LOOKUP: return "orch_lookup"; - case AicpuPhaseId::ORCH_HEAP: + case L2SwimlaneAicpuPhaseId::ORCH_HEAP: return "orch_heap"; - 
case AicpuPhaseId::ORCH_INSERT: + case L2SwimlaneAicpuPhaseId::ORCH_INSERT: return "orch_insert"; - case AicpuPhaseId::ORCH_FANIN: + case L2SwimlaneAicpuPhaseId::ORCH_FANIN: return "orch_fanin"; - case AicpuPhaseId::ORCH_FINALIZE: + case L2SwimlaneAicpuPhaseId::ORCH_FINALIZE: return "orch_finalize"; - case AicpuPhaseId::ORCH_SCOPE_END: + case L2SwimlaneAicpuPhaseId::ORCH_SCOPE_END: return "orch_scope_end"; default: return "unknown"; @@ -684,7 +687,7 @@ int L2PerfCollector::export_swimlane_json() { // Phase-specific deltas (currently only SCHED_DISPATCH carries // pop_hit / pop_miss). Other phases pass zero extras; omitting // them keeps the JSON terse per record. - if (pr.phase_id == AicpuPhaseId::SCHED_DISPATCH) { + if (pr.phase_id == L2SwimlaneAicpuPhaseId::SCHED_DISPATCH) { outfile << ", \"pop_hit\": " << pr.extra1 << ", \"pop_miss\": " << pr.extra2; } outfile << "}"; @@ -699,12 +702,12 @@ int L2PerfCollector::export_swimlane_json() { // Per-task orchestrator phase records (level >= 4, filtered from unified collected_phase_records_) // Orchestrator timing is no longer emitted as a separate aggregate - // block. Per-event AicpuPhaseRecord[] entries (emitted as + // block. Per-event L2SwimlaneAicpuPhaseRecord[] entries (emitted as // aicpu_orchestrator_phases below) are the single source of truth; // the run-window envelope is still visible in the device-side // LOG_INFO_V9 "Thread N: orch_start=… orch_end=… orch_cost=…" line. 
bool has_orch_phases = false; - if (l2_perf_level_ >= L2PerfLevel::ORCH_PHASES) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { for (const auto &v : collected_phase_records_) { for (const auto &r : v) { if (!is_scheduler_phase(r.phase_id)) { @@ -764,7 +767,7 @@ int L2PerfCollector::export_swimlane_json() { // finalize // --------------------------------------------------------------------------- -int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2PerfFreeCallback &free_cb) { +int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, const L2SwimlaneFreeCallback &free_cb) { if (shm_host_ == nullptr) return 0; // Stop mgmt + collector threads if the caller didn't already (idempotent). @@ -782,7 +785,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe // shadow stays in dev_to_host_ and is freed by clear_mappings() below // (single source of truth for shadow lifetime, no double-free risk). for (int i = 0; i < num_aicore_; i++) { - L2PerfBufferState *state = get_perf_buffer_state(shm_host_, i); + L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm_host_, i); release_dev(reinterpret_cast(state->current_buf_ptr)); state->current_buf_ptr = 0; @@ -804,7 +807,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe int num_phase_threads = PLATFORM_MAX_AICPU_THREADS; for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(shm_host_, num_aicore_, t); + L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm_host_, num_aicore_, t); release_dev(reinterpret_cast(state->current_buf_ptr)); state->current_buf_ptr = 0; @@ -831,7 +834,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe release_dev(p); }); - // Free per-core L2PerfAicoreRings (no host shadow paired). The rings + // Free per-core L2SwimlaneAicoreRings (no host shadow paired). 
The rings // were allocated directly via alloc_cb (not alloc_single_buffer), so no // entry exists in dev_to_host_ for them. for (auto *ring_dev : aicore_rings_dev_) { diff --git a/src/a5/platform/src/host/pmu_collector.cpp b/src/a5/platform/src/host/pmu_collector.cpp index 1468afa01..e6af94941 100644 --- a/src/a5/platform/src/host/pmu_collector.cpp +++ b/src/a5/platform/src/host/pmu_collector.cpp @@ -317,7 +317,7 @@ void PmuCollector::reconcile_counters() { // Cross-check device-side totals against host CSV. PMU is single-kind // (one per-core pool), so reconcile_one is invoked once; the lambda - // shape matches L2PerfCollector::reconcile_counters so the two + // shape matches L2SwimlaneCollector::reconcile_counters so the two // single-arch implementations stay diff-able. auto reconcile_one = [&](int unit_count, auto get_state, uint64_t collected, bool optional) { uint64_t total_device = 0; diff --git a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp index 583dc1da7..32e1ff714 100644 --- a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -11,9 +11,9 @@ #include "aicore/aicore.h" #include "aicore/aicore_profiling_state.h" -#include "aicore/l2_perf_collector_aicore.h" +#include "aicore/l2_swimlane_collector_aicore.h" #include "aicore/pmu_collector_aicore.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" // Platform configuration (C/C++ compatible) #include "common/pmu_profiling.h" #include "runtime.h" @@ -60,10 +60,10 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // AICore kernel entry from KernelArgs::regs[physical_core_id]), so // they are safe to cache here. 
uint32_t profiling_flag = get_aicore_profiling_flag(); - bool l2_perf_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE); + bool l2_swimlane_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE); bool dump_tensor_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_DUMP_TENSOR); bool pmu_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_PMU); - __gm__ L2PerfAicoreRing *l2_perf_ring = l2_perf_enabled ? get_aicore_l2_perf_ring() : nullptr; + __gm__ L2SwimlaneAicoreRing *l2_swimlane_ring = l2_swimlane_enabled ? get_aicore_l2_swimlane_ring() : nullptr; __gm__ PmuAicoreRing *pmu_ring = pmu_enabled ? get_aicore_pmu_ring() : nullptr; uint64_t pmu_reg_base = pmu_enabled ? get_aicore_pmu_reg_base() : 0; @@ -105,9 +105,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in pipe_barrier(PIPE_ALL); } - if (l2_perf_enabled) { + if (l2_swimlane_enabled) { uint64_t end_time = get_sys_cnt_aicore(); - l2_perf_aicore_record_task(l2_perf_ring, actual_task_id, start_time, end_time); + l2_swimlane_aicore_record_task(l2_swimlane_ring, actual_task_id, start_time, end_time); } last_task_id = task_id; diff --git a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp index a073cd7c2..729944ea7 100644 --- a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -16,13 +16,13 @@ #include "aicpu/device_log.h" #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" #include "aicpu/platform_regs.h" #include "callable.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "runtime.h" @@ -141,7 +141,7 
@@ struct AicpuExecutor { inline bool try_dispatch_task( int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head, - int &ready_count, bool l2_perf_enabled + int &ready_count, bool l2_swimlane_enabled ); }; @@ -243,7 +243,7 @@ inline void AicpuExecutor::resolve_task_dependencies( // Try to dispatch a task from thread-local queue to a core inline bool AicpuExecutor::try_dispatch_task( int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head, int &ready_count, - bool l2_perf_enabled + bool l2_swimlane_enabled ) { if (ready_count <= 0) { return false; @@ -286,7 +286,7 @@ inline bool AicpuExecutor::try_dispatch_task( pending_task_ids_[core_id] = task_id; // Record the real AICPU dispatch point for this core. - if (l2_perf_enabled && get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_enabled && get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } @@ -359,7 +359,7 @@ int AicpuExecutor::init(Runtime *runtime) { dispatch_timestamps_[i] = 0; } if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init(runtime->worker_count); + l2_swimlane_aicpu_init(runtime->worker_count); } #if PTO2_PROFILING if (is_dump_tensor_enabled()) { @@ -681,8 +681,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const int verification_warning_count = 0; const int MAX_VERIFICATION_WARNINGS = 10; - bool l2_perf_enabled = is_l2_swimlane_enabled(); - L2PerfLevel l2_perf_level = get_l2_perf_level(); + bool l2_swimlane_enabled = is_l2_swimlane_enabled(); + L2SwimlaneLevel l2_swimlane_level = get_l2_swimlane_level(); // Extract array pointers as local variables for better readability and performance int *cur_ready_queue_aic = cur_ready_queue_aic_[thread_idx]; @@ -704,7 +704,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const ); // Initialize dispatch timestamps for all cores (only needed at 
level >= 2) - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { uint64_t dispatch_start_time = get_sys_cnt_aicpu(); for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; @@ -741,54 +741,54 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const // Profiling: when prev_running_id exists, its AICore timing was // written to wip[id & 1] first, so complete it BEFORE the // pending task's record to maintain buffer ordering. - if (l2_perf_enabled) { - uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + if (l2_swimlane_enabled) { + uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; if (prev_running_id != AICPU_TASK_INVALID) { Task *prev_task = &runtime.tasks[prev_running_id]; uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int fanout_count = 0; - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { for (int i = 0; i < prev_task->fanout_count; i++) { fanout_arr[i] = static_cast(prev_task->fanout[i]); } fanout_count = prev_task->fanout_count; } - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(prev_running_id), static_cast(prev_running_id), prev_task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id, + "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id, prev_running_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } - finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? 
get_sys_cnt_aicpu() : 0; Task *task = &runtime.tasks[completed_task_id]; uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int fanout_count = 0; - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { for (int i = 0; i < task->fanout_count; i++) { fanout_arr[i] = static_cast(task->fanout[i]); } fanout_count = task->fanout_count; } - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(completed_task_id), static_cast(completed_task_id), task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id + "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -805,12 +805,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head, - cur_aic_ready_count, l2_perf_enabled + cur_aic_ready_count, l2_swimlane_enabled ); } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head, - cur_aiv_ready_count, l2_perf_enabled + cur_aiv_ready_count, l2_swimlane_enabled ); } @@ -842,7 +842,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const made_progress = true; // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched) - if (!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (!dispatched && l2_swimlane_enabled && 
l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) { @@ -864,28 +864,29 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const // Count it here to avoid losing completion. if (prev_running_id != AICPU_TASK_INVALID) { // Profiling: complete the implicit task's AICore record - if (l2_perf_enabled) { - uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; + if (l2_swimlane_enabled) { + uint64_t finish_ts = + (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; Task *prev_task = &runtime.tasks[prev_running_id]; uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int fanout_count = 0; - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { for (int i = 0; i < prev_task->fanout_count; i++) { fanout_arr[i] = static_cast(prev_task->fanout[i]); } fanout_count = prev_task->fanout_count; } - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(prev_running_id), static_cast(prev_running_id), prev_task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id, + "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id, prev_running_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -915,27 +916,27 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const int completed_task_id = running_task_ids_[core_id]; - if (l2_perf_enabled) { - uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? 
get_sys_cnt_aicpu() : 0; + if (l2_swimlane_enabled) { + uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0; Task *task = &runtime.tasks[completed_task_id]; uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int fanout_count = 0; - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { for (int i = 0; i < task->fanout_count; i++) { fanout_arr[i] = static_cast(task->fanout[i]); } fanout_count = task->fanout_count; } - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(completed_task_id), static_cast(completed_task_id), task->func_id, h->core_type, dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id + "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id ); } - if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -950,12 +951,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head, - cur_aic_ready_count, l2_perf_enabled + cur_aic_ready_count, l2_swimlane_enabled ); } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) { dispatched = try_dispatch_task( core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head, - cur_aiv_ready_count, l2_perf_enabled + cur_aiv_ready_count, l2_swimlane_enabled ); } } @@ -969,7 +970,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const made_progress = true; // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched) - if 
(!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) { + if (!dispatched && l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } @@ -979,14 +980,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) { if (try_dispatch_task( core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head, - cur_aic_ready_count, l2_perf_enabled + cur_aic_ready_count, l2_swimlane_enabled )) { made_progress = true; } } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) { if (try_dispatch_task( core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head, - cur_aiv_ready_count, l2_perf_enabled + cur_aiv_ready_count, l2_swimlane_enabled )) { made_progress = true; } @@ -1125,7 +1126,7 @@ int AicpuExecutor::run(Runtime *runtime) { // Flush performance buffers for cores managed by this thread. 
if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_flush_buffers(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]); + l2_swimlane_aicpu_flush(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]); } #if PTO2_PROFILING if (is_pmu_enabled()) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index bb4a98e91..4583175d0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -11,9 +11,9 @@ #include "aicore/aicore.h" #include "aicore/aicore_profiling_state.h" -#include "aicore/l2_perf_collector_aicore.h" +#include "aicore/l2_swimlane_collector_aicore.h" #include "aicore/pmu_collector_aicore.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" // Register-based communication #include "common/pmu_profiling.h" #include "pto2_dispatch_payload.h" @@ -98,10 +98,10 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // AICore kernel entry from KernelArgs::regs[physical_core_id]), so // they are safe to cache here. uint32_t profiling_flag = get_aicore_profiling_flag(); - bool l2_perf_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE); + bool l2_swimlane_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE); bool dump_tensor_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_DUMP_TENSOR); bool pmu_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_PMU); - __gm__ L2PerfAicoreRing *l2_perf_ring = l2_perf_enabled ? get_aicore_l2_perf_ring() : nullptr; + __gm__ L2SwimlaneAicoreRing *l2_swimlane_ring = l2_swimlane_enabled ? get_aicore_l2_swimlane_ring() : nullptr; __gm__ PmuAicoreRing *pmu_ring = pmu_enabled ? get_aicore_pmu_ring() : nullptr; uint64_t pmu_reg_base = pmu_enabled ? 
get_aicore_pmu_reg_base() : 0; @@ -155,9 +155,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in } // Performance profiling: record task execution - if (l2_perf_enabled) { + if (l2_swimlane_enabled) { uint64_t end_time = get_sys_cnt_aicore(); - l2_perf_aicore_record_task(l2_perf_ring, task_id, start_time, end_time); + l2_swimlane_aicore_record_task(l2_swimlane_ring, task_id, start_time, end_time); } last_reg_val = reg_val; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 92dd7db82..8a8a88816 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -35,10 +35,10 @@ #include "pto_shared_memory.h" // Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/scope_stats_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/unified_log.h" // Register-based communication @@ -523,7 +523,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); #if PTO2_PROFILING - rt->orchestrator.l2_perf_level = get_l2_perf_level(); + rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level(); { auto &orch = rt->orchestrator; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { @@ -549,8 +549,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) { sched_ctx_.wait_init_complete(); #if PTO2_PROFILING - if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { - l2_perf_aicpu_set_orch_thread_idx(thread_idx); + if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { + l2_swimlane_aicpu_set_orch_thread_idx(thread_idx); } // scope_stats streams scope_end records off the orchestrator thread: // record the per-thread 
ready_queue index. No-op (writer shared @@ -648,7 +648,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line // below carries the same envelope info for debugging, and // host-side swimlane derives per-phase timing from the per-event - // AicpuPhaseRecord[] stream that already covers everything inside + // L2SwimlaneAicpuPhaseRecord[] stream that already covers everything inside // submit_task(). int32_t total_tasks = 0; if (rt->orchestrator.sm_header) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 79ae71b24..d8ec5f736 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -166,8 +166,8 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX ``` Per-thread fanout / fanin edge counts and ready-queue pop hit / miss -stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records.json` -captured at l2_perf_level >= 3) and `deps.json`; consume them via +stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json` +captured at l2_swimlane_level >= 3) and `deps.json`; consume them via `simpler_setup/tools/sched_overhead_analysis.py`. --- @@ -241,10 +241,10 @@ mirrors the PMU pattern — two independent channels (one binary, one int): (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read by AICore (which only needs on/off to decide whether to write timing) and by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`. -- **Granular level (0–4)** — `L2PerfDataHeader::l2_perf_level` - (shared memory). 
Host writes it in `L2PerfCollector::initialize`; AICPU - promotes it from the header in `l2_perf_aicpu_init` and exposes it via - `get_l2_perf_level()` (typed `L2PerfLevel`) for +- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level` + (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU + promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via + `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates. On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled` @@ -263,7 +263,7 @@ Bare `--enable-l2-swimlane` = level 4 (backward compatible). ### Level gating in AICPU code -Use the strongly-typed `L2PerfLevel` enum so each gate names the +Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the content it depends on instead of relying on magic numbers: ```cpp @@ -272,19 +272,19 @@ content it depends on instead of relying on magic numbers: if (is_l2_swimlane_enabled()) { ... } // AICPU dispatch/finish timestamps + fanout. -// Granular checks below require l2_perf_aicpu_init to have already run +// Granular checks below require l2_swimlane_aicpu_init to have already run // (so the level has been promoted from the shared-memory header). -if (get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) { ... } +if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... } // Scheduler main-loop phase records (SCHED_*) -if (get_l2_perf_level() >= L2PerfLevel::SCHED_PHASES) { ... } +if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... } // Orchestrator phase records -if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { ... } +if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... 
} ``` -`L2PerfLevel` is defined in `common/l2_perf_profiling.h` with -underlying type `uint32_t` (matches the `L2PerfDataHeader::l2_perf_level` +`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with +underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level` shared-memory field and mirrors `PmuEventType : uint32_t`): | Enumerator | Underlying value | diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index db06248e6..b2efb224e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -52,7 +52,7 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl // ============================================================================= #if PTO2_ORCH_PROFILING #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" // Weak fallback for builds that don't link device_time.cpp (e.g. host). // The strong symbol from platform/.../device_time.cpp wins in the AICPU build. // @@ -65,11 +65,11 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl // so the AICPU .so's PLT resolves to its own strong definition from // device_time.cpp. __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } -// Weak fallback for builds that don't link l2_perf_collector_aicpu.cpp. +// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp. // The strong symbol from the AICPU build wins when profiling is available. // Also hidden to prevent HOST .so from polluting the global symbol table. 
__attribute__((weak, visibility("hidden"))) void -l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} +l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // Accumulated cycles per sub-step (only needed for ORCH_PROFILING export) static uint64_t g_orch_sync_cycle = 0; // tensormap sync static uint64_t g_orch_alloc_cycle = 0; // unified task+heap alloc @@ -87,11 +87,11 @@ uint64_t g_orch_args_atomic_count = 0; uint64_t g_orch_scope_end_atomic_count = 0; // Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what // the flag is for). Swim-lane recording is an opt-in add-on gated at runtime -// by l2_perf_level so callers can collect totals without paying GM-store cost. +// by l2_swimlane_level so callers can collect totals without paying GM-store cost. // When the swim-lane write fires, _t0 is re-sampled from the counter *after* // the write so its cost is not attributed to the next phase's accumulator. 
-#define CYCLE_COUNT_START() \ - bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ uint64_t _t0 = get_sys_cnt_aicpu(), _t1 #define CYCLE_COUNT_LAP(acc) \ do { \ @@ -99,38 +99,38 @@ uint64_t g_orch_scope_end_atomic_count = 0; acc += (_t1 - _t0); \ _t0 = _t1; \ } while (0) -#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \ - do { \ - _t1 = get_sys_cnt_aicpu(); \ - acc += (_t1 - _t0); \ - if (_prof_active) { \ - l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \ - _t0 = get_sys_cnt_aicpu(); \ - } else { \ - _t0 = _t1; \ - } \ +#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + if (_prof_active) { \ + l2_swimlane_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \ + _t0 = get_sys_cnt_aicpu(); \ + } else { \ + _t0 = _t1; \ + } \ } while (0) #elif PTO2_PROFILING #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } __attribute__((weak, visibility("hidden"))) void -l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} +l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) static uint32_t g_orch_submit_idx = 0; -#define CYCLE_COUNT_START() \ - bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ uint64_t _t0 = _prof_active ? 
get_sys_cnt_aicpu() : 0, _t1 = 0 #define CYCLE_COUNT_LAP(acc) \ do { \ } while (0) -#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \ - do { \ - if (_prof_active) { \ - _t1 = get_sys_cnt_aicpu(); \ - l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \ - _t0 = _t1; \ - } \ +#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \ + do { \ + if (_prof_active) { \ + _t1 = get_sys_cnt_aicpu(); \ + l2_swimlane_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \ + _t0 = _t1; \ + } \ } while (0) #else #define CYCLE_COUNT_START() @@ -469,7 +469,7 @@ void PTO2OrchestratorState::end_scope() { #if PTO2_ORCH_PROFILING uint64_t _se1 = get_sys_cnt_aicpu(); g_orch_scope_end_cycle += (_se1 - _se0); - // l2_perf_aicpu_record_orch_phase(AicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx, -1); + // l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx, -1); #endif } @@ -504,7 +504,7 @@ static TaskOutputTensors submit_task_common( PTO2FaninBuilder fanin_builder(orch->rings[ring_id].fanin_pool); - CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, L2SwimlaneAicpuPhaseId::ORCH_ALLOC, task_id.raw); #if PTO2_PROFILING if (layout.total_output_size > 0) { @@ -519,7 +519,7 @@ static TaskOutputTensors submit_task_common( orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive); - CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, L2SwimlaneAicpuPhaseId::ORCH_SYNC, task_id.raw); for (uint32_t i = 0; i < args.explicit_dep_count(); i++) { PTO2TaskId dep_task_id = args.explicit_dep(i); @@ -557,12 +557,12 @@ static TaskOutputTensors submit_task_common( return result; } - CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, 
L2SwimlaneAicpuPhaseId::ORCH_LOOKUP, task_id.raw); // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) === register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope()); - CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, L2SwimlaneAicpuPhaseId::ORCH_INSERT, task_id.raw); // === STEP 5: Batch-write to GM (single cache line burst) === // Deferred from allocation phase to avoid scattered GM writes that get @@ -603,7 +603,7 @@ static TaskOutputTensors submit_task_common( } #endif - CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, L2SwimlaneAicpuPhaseId::ORCH_PARAMS, task_id.raw); #if PTO2_ORCH_PROFILING g_orch_args_atomic_count += 2; // fanout_lock.store + fanout_count.store #endif @@ -617,7 +617,7 @@ static TaskOutputTensors submit_task_common( SPIN_WAIT_HINT(); } - CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, L2SwimlaneAicpuPhaseId::ORCH_FANIN, task_id.raw); #if PTO2_PROFILING orch->tasks_submitted++; @@ -766,7 +766,7 @@ TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) { PTO2TaskDescriptor &task = *prepared.task; PTO2TaskPayload &payload = *prepared.payload; - CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, prepared.task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, L2SwimlaneAicpuPhaseId::ORCH_ALLOC, prepared.task_id.raw); #if PTO2_PROFILING if (layout.total_output_size > 0) { @@ -788,7 +788,7 @@ TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) { payload.fanin_actual_count = 0; payload.fanin_spill_start = 0; payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool; - CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, prepared.task_id.raw); + 
CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, L2SwimlaneAicpuPhaseId::ORCH_PARAMS, prepared.task_id.raw); if (prepared.slot_state != nullptr) { // Hidden alloc tasks complete inline in the orchestrator before any @@ -803,7 +803,7 @@ TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) { } orch->inline_completed_tasks++; - CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, prepared.task_id.raw); + CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, L2SwimlaneAicpuPhaseId::ORCH_FANIN, prepared.task_id.raw); #if PTO2_PROFILING orch->tasks_submitted++; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 9a73714c0..bfd4e7b30 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -28,7 +28,7 @@ #pragma once #include "device_arena.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "pto_ring_buffer.h" #include "pto_runtime2_types.h" #include "pto_submit_types.h" @@ -92,8 +92,8 @@ struct PTO2OrchestratorState { int32_t total_cluster_count{0}; // AIC cores = MIX clusters int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING - // L2 perf_level copied from get_l2_perf_level(). - L2PerfLevel l2_perf_level{L2PerfLevel::DISABLED}; + // L2 swimlane_level copied from get_l2_swimlane_level(). 
+ L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED}; #endif // === GM HEAP (for output buffers) === diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 6d0849e46..4db9245e5 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -15,11 +15,11 @@ #include "common/unified_log.h" #include "aicpu/device_time.h" -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/platform_regs.h" #include "aicpu/pmu_collector_aicpu.h" #include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/platform_config.h" #include "pto_runtime2.h" #include "pto_shared_memory.h" @@ -377,30 +377,33 @@ int32_t SchedulerContext::handle_timeout_exit( } #if PTO2_PROFILING -void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed) { - auto &l2_perf = sched_l2_perf_[thread_idx]; +void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) { + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; uint64_t sched_end_ts = get_sys_cnt_aicpu(); LOG_INFO_V9( "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx, - static_cast(l2_perf.sched_start_ts), static_cast(sched_end_ts), - cycles_to_us(sched_end_ts - l2_perf.sched_start_ts) + static_cast(l2_swimlane.sched_start_ts), static_cast(sched_end_ts), + cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts) ); - uint64_t sched_total = l2_perf.sched_wiring_cycle + l2_perf.sched_complete_cycle + l2_perf.sched_scan_cycle + - l2_perf.sched_dispatch_cycle + l2_perf.sched_idle_cycle; + uint64_t sched_total = l2_swimlane.sched_wiring_cycle + 
l2_swimlane.sched_complete_cycle + + l2_swimlane.sched_scan_cycle + l2_swimlane.sched_dispatch_cycle + + l2_swimlane.sched_idle_cycle; if (sched_total == 0) sched_total = 1; #if PTO2_SCHED_PROFILING { PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx); uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; - uint64_t complete_poll = (l2_perf.sched_complete_cycle > otc_total + l2_perf.sched_complete_perf_cycle) ? - (l2_perf.sched_complete_cycle - otc_total - l2_perf.sched_complete_perf_cycle) : - 0; - uint64_t dispatch_poll = - (l2_perf.sched_dispatch_cycle > l2_perf.sched_dispatch_pop_cycle + l2_perf.sched_dispatch_setup_cycle) ? - (l2_perf.sched_dispatch_cycle - l2_perf.sched_dispatch_pop_cycle - l2_perf.sched_dispatch_setup_cycle) : + uint64_t complete_poll = + (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ? + (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) : 0; + uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle > + l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ? + (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle - + l2_swimlane.sched_dispatch_setup_cycle) : + 0; LOG_INFO_V9( "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx, @@ -411,20 +414,21 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges // × core_to_thread). LOG_INFO_V9( - "Thread %d: complete : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_complete_cycle), - l2_perf.sched_complete_cycle * 100.0 / sched_total + "Thread %d: complete : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle), + l2_swimlane.sched_complete_cycle * 100.0 / sched_total ); - uint64_t c_parent = l2_perf.sched_complete_cycle > 0 ? 
l2_perf.sched_complete_cycle : 1; - uint64_t complete_miss_count = (l2_perf.complete_probe_count > l2_perf.complete_hit_count) ? - (l2_perf.complete_probe_count - l2_perf.complete_hit_count) : + uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1; + uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ? + (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) : 0; - double complete_hit_rate = - l2_perf.complete_probe_count > 0 ? l2_perf.complete_hit_count * 100.0 / l2_perf.complete_probe_count : 0.0; + double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ? + l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count : + 0.0; LOG_INFO_V9( "Thread %d: poll : %.3fus (%.1f%%) hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%", thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent, - static_cast(l2_perf.complete_hit_count), static_cast(complete_miss_count), + static_cast(l2_swimlane.complete_hit_count), static_cast(complete_miss_count), complete_hit_rate ); LOG_INFO_V9( @@ -451,7 +455,8 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa ); LOG_INFO_V9( "Thread %d: perf : %.3fus (%.1f%%)", thread_idx, - cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent + cycles_to_us(l2_swimlane.sched_complete_perf_cycle), + l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent ); // pop_hit / pop_miss per-emit deltas live in each dispatch-phase @@ -459,70 +464,72 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa // the run-cumulative tracked in this struct (final-drain emit covers // the trailing-idle tail). 
LOG_INFO_V9( - "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle), - l2_perf.sched_dispatch_cycle * 100.0 / sched_total + "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle), + l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total ); - uint64_t global_dispatch_count = l2_perf.pop_hit - l2_perf.local_dispatch_count; - uint64_t total_dispatched = l2_perf.local_dispatch_count + global_dispatch_count; - double local_hit_rate = total_dispatched > 0 ? l2_perf.local_dispatch_count * 100.0 / total_dispatched : 0.0; + uint64_t global_dispatch_count = l2_swimlane.pop_hit - l2_swimlane.local_dispatch_count; + uint64_t total_dispatched = l2_swimlane.local_dispatch_count + global_dispatch_count; + double local_hit_rate = + total_dispatched > 0 ? l2_swimlane.local_dispatch_count * 100.0 / total_dispatched : 0.0; LOG_INFO_V9( "Thread %d: local_disp : local=%" PRIu64 ", global=%" PRIu64 ", overflow=%" PRIu64 ", local_rate=%.1f%%", - thread_idx, static_cast(l2_perf.local_dispatch_count), - static_cast(global_dispatch_count), static_cast(l2_perf.local_overflow_count), + thread_idx, static_cast(l2_swimlane.local_dispatch_count), + static_cast(global_dispatch_count), static_cast(l2_swimlane.local_overflow_count), local_hit_rate ); - uint64_t d_parent = l2_perf.sched_dispatch_cycle > 0 ? l2_perf.sched_dispatch_cycle : 1; + uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? 
l2_swimlane.sched_dispatch_cycle : 1; LOG_INFO_V9( "Thread %d: poll : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll), dispatch_poll * 100.0 / d_parent ); LOG_INFO_V9( "Thread %d: pop : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, - cycles_to_us(l2_perf.sched_dispatch_pop_cycle), l2_perf.sched_dispatch_pop_cycle * 100.0 / d_parent, - cycles_to_us(l2_perf.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent, + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), static_cast(sp.pop_atomic_count) ); LOG_INFO_V9( "Thread %d: setup : %.3fus (%.1f%%)", thread_idx, - cycles_to_us(l2_perf.sched_dispatch_setup_cycle), l2_perf.sched_dispatch_setup_cycle * 100.0 / d_parent + cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle), + l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent ); LOG_INFO_V9( - "Thread %d: scan : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_scan_cycle), - l2_perf.sched_scan_cycle * 100.0 / sched_total + "Thread %d: scan : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_scan_cycle), + l2_swimlane.sched_scan_cycle * 100.0 / sched_total ); #if PTO2_SCHED_PROFILING LOG_INFO_V9( "Thread %d: wiring : %.3fus (%.1f%%) tasks=%d", thread_idx, - cycles_to_us(l2_perf.sched_wiring_cycle), l2_perf.sched_wiring_cycle * 100.0 / sched_total, - l2_perf.phase_wiring_count + cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total, + l2_swimlane.phase_wiring_count ); #else LOG_INFO_V9( - "Thread %d: wiring : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_wiring_cycle), - l2_perf.sched_wiring_cycle * 100.0 / sched_total + "Thread %d: wiring : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle), + l2_swimlane.sched_wiring_cycle * 100.0 / sched_total 
); #endif LOG_INFO_V9( - "Thread %d: idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_idle_cycle), - l2_perf.sched_idle_cycle * 100.0 / sched_total + "Thread %d: idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle), + l2_swimlane.sched_idle_cycle * 100.0 / sched_total ); if (cur_thread_completed > 0) { LOG_INFO_V9( "Thread %d: avg/complete : %.3fus", thread_idx, - cycles_to_us(l2_perf.sched_complete_cycle) / cur_thread_completed + cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed ); } } #endif LOG_INFO_V9( "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, - cycles_to_us(sched_total), static_cast(l2_perf.sched_loop_count), cur_thread_completed + cycles_to_us(sched_total), static_cast(l2_swimlane.sched_loop_count), cur_thread_completed ); } #endif @@ -837,18 +844,18 @@ int32_t SchedulerContext::init( regs_ = regs_base; #if PTO2_PROFILING - // l2_perf_aicpu_init promotes g_l2_perf_level from the shared-memory + // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory // header — must be called BEFORE the orchestrator thread caches the level - // via rt->orchestrator.l2_perf_level = get_l2_perf_level() in + // via rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level() in // AicpuExecutor::run(). Otherwise the cached value would still be DISABLED // (only the binary enable bit has been seeded by kernel.cpp at this point), // and the CYCLE_COUNT_START() gate in pto_orchestrator.cpp would suppress // all ORCH_PHASES records. 
if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init(runtime->worker_count); - l2_perf_level_ = get_l2_perf_level(); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - l2_perf_aicpu_init_phase(runtime->worker_count, sched_thread_num_); + l2_swimlane_aicpu_init(runtime->worker_count); + l2_swimlane_level_ = get_l2_swimlane_level(); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_thread_num_); } } #endif @@ -973,9 +980,9 @@ void SchedulerContext::on_orchestration_done( Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks ) { #if PTO2_PROFILING - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { // Flush orchestrator's phase record buffer - l2_perf_aicpu_flush_phase_buffers(thread_idx); + l2_swimlane_aicpu_flush_phase_buffers(thread_idx); } #endif @@ -1028,10 +1035,10 @@ void SchedulerContext::on_orchestration_done( // Write core-to-thread mapping AFTER reassignment so the profiling data // reflects the final distribution (all active_sched_threads_, including // former orchestrator threads when orch_to_sched_ is enabled). 
- if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - l2_perf_aicpu_init_core_assignments(cores_total_num_); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_init_core_assignments(cores_total_num_); for (int32_t t = 0; t < active_sched_threads_; t++) { - l2_perf_aicpu_write_core_assignments_for_thread( + l2_swimlane_aicpu_write_core_assignments_for_thread( t, core_trackers_[t].core_ids(), core_trackers_[t].core_num() ); } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp index 687d5f15d..5784f54dc 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp @@ -13,7 +13,7 @@ #include "common/unified_log.h" #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "pto_runtime2.h" @@ -21,7 +21,7 @@ #include "spin_hint.h" // Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" @@ -77,7 +77,7 @@ void SchedulerContext::complete_slot_task( #endif ) { #if PTO2_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #else (void)hank; #endif @@ -130,7 +130,7 @@ void SchedulerContext::complete_slot_task( sched_->on_mixed_task_complete(slot_state, local_bufs); #endif #if PTO2_PROFILING - l2_perf.phase_complete_count++; + l2_swimlane.phase_complete_count++; #endif if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) { deferred_release_slot_states[deferred_release_count++] = &slot_state; @@ -151,7 +151,7 @@ void 
SchedulerContext::complete_slot_task( } #if PTO2_PROFILING - if (l2_perf.l2_perf_enabled) { + if (l2_swimlane.l2_swimlane_enabled) { #if PTO2_SCHED_PROFILING uint64_t t_perf_start = get_sys_cnt_aicpu(); #endif @@ -159,7 +159,7 @@ void SchedulerContext::complete_slot_task( uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int32_t fanout_n = 0; - if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { finish_ts = get_sys_cnt_aicpu(); PTO2DepListEntry *cur = slot_state.fanout_head; while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { @@ -169,18 +169,18 @@ void SchedulerContext::complete_slot_task( } int32_t perf_slot_idx = static_cast(subslot); - if (l2_perf_aicpu_complete_record( + if (l2_swimlane_aicpu_complete_task( core_id, thread_idx, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, slot_state.task->kernel_id[perf_slot_idx], hank[core_id].core_type, dispatch_ts, finish_ts, fanout_arr, fanout_n ) != 0) { LOG_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, + "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id, static_cast(slot_state.task->task_id.raw) ); } #if PTO2_SCHED_PROFILING - l2_perf.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start); + l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start); #endif } #endif @@ -224,7 +224,7 @@ void SchedulerContext::check_running_cores_for_completion( PTO2LocalReadyBuffer *local_bufs ) { #if PTO2_SCHED_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #endif CoreTracker &tracker = core_trackers_[thread_idx]; auto running_core_states = tracker.get_all_running_cores(); @@ -246,8 +246,8 @@ void SchedulerContext::check_running_cores_for_completion( int32_t reg_state = EXTRACT_TASK_STATE(reg_val); #if PTO2_SCHED_PROFILING - if (l2_perf.l2_perf_enabled) { - l2_perf.complete_probe_count++; + if 
(l2_swimlane.l2_swimlane_enabled) { + l2_swimlane.complete_probe_count++; } #endif @@ -256,8 +256,8 @@ void SchedulerContext::check_running_cores_for_completion( if (!t.matched) continue; #if PTO2_SCHED_PROFILING - if (l2_perf.l2_perf_enabled && (t.running_done || t.pending_done)) { - l2_perf.complete_hit_count++; + if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) { + l2_swimlane.complete_hit_count++; } #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index 34271baef..0f6edf1aa 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -11,7 +11,7 @@ #ifndef SCHEDULER_CONTEXT_H #define SCHEDULER_CONTEXT_H -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/unified_log.h" #include "scheduler_types.h" @@ -134,10 +134,10 @@ class SchedulerContext { SyncStartDrainState drain_state_; #if PTO2_PROFILING - SchedL2PerfCounters sched_l2_perf_[MAX_AICPU_THREADS]; - // Cached once at init() from get_l2_perf_level(), AFTER - // l2_perf_aicpu_init has promoted the level from the shared-memory header. - L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; + SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS]; + // Cached once at init() from get_l2_swimlane_level(), AFTER + // l2_swimlane_aicpu_init has promoted the level from the shared-memory header. 
+ L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; #endif // --- Task-execution tracking --- @@ -353,7 +353,7 @@ class SchedulerContext { ); #if PTO2_PROFILING - __attribute__((noinline, cold)) void log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed); + __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed); #endif // ========================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 9ca15c9c1..0a3efe40f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -18,7 +18,7 @@ #include "aicpu/device_time.h" #include "aicpu/platform_regs.h" #include "callable.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "pto_runtime2.h" @@ -26,7 +26,7 @@ #include "spin_hint.h" // Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" @@ -74,15 +74,15 @@ int SchedulerContext::pop_ready_tasks_batch( PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count ) { #if PTO2_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #if PTO2_SCHED_PROFILING extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; uint64_t t_pop_start = get_sys_cnt_aicpu(); int count = sched_->get_ready_tasks_batch( shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx], - 
l2_perf.local_dispatch_count + l2_swimlane.local_dispatch_count ); - l2_perf.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); + l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); #else int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count); #endif @@ -90,9 +90,9 @@ int SchedulerContext::pop_ready_tasks_batch( // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health // stats on default builds. if (count > 0) { - l2_perf.pop_hit += count; + l2_swimlane.pop_hit += count; } else { - l2_perf.pop_miss++; + l2_swimlane.pop_miss++; } #else (void)thread_idx; @@ -155,7 +155,7 @@ void SchedulerContext::dispatch_subtask_to_core( core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -164,7 +164,7 @@ void SchedulerContext::dispatch_subtask_to_core( core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) { + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -247,7 +247,7 @@ void SchedulerContext::dispatch_block( ); } #if PTO2_PROFILING - sched_l2_perf_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask()); + sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask()); #endif } @@ -256,7 +256,7 @@ void SchedulerContext::dispatch_shape( PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed ) { #if PTO2_SCHED_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto 
&l2_swimlane = sched_l2_swimlane_[thread_idx]; #endif if (entered_drain) return; @@ -324,7 +324,7 @@ void SchedulerContext::dispatch_shape( } made_progress = true; #if PTO2_SCHED_PROFILING - l2_perf.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); + l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); #endif } @@ -353,7 +353,7 @@ void SchedulerContext::dispatch_ready_tasks( const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1]; #if PTO2_SCHED_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; #endif // Note: flush_local_bufs is invoked multiple times per pass (mid-function @@ -367,7 +367,7 @@ void SchedulerContext::dispatch_ready_tasks( for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) { auto &lb = local_bufs[s]; #if PTO2_SCHED_PROFILING - l2_perf.local_overflow_count += lb.count; + l2_swimlane.local_overflow_count += lb.count; #endif if (lb.count > 0) { sched_->ready_queues[s].push_batch(lb.slot_states, lb.count); @@ -481,9 +481,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ ); // One-time init: assign perf buffers (one thread does it; others wait). - // l2_perf_aicpu_init / l2_perf_aicpu_init_phase already ran eagerly in + // l2_swimlane_aicpu_init / l2_swimlane_aicpu_init_phase already ran eagerly in // SchedulerContext::init() so the orchestrator thread can read the - // promoted g_l2_perf_level before caching it on rt->orchestrator. Only + // promoted g_l2_swimlane_level before caching it on rt->orchestrator. Only // dump_tensor / pmu init remain dispatch-time because they depend on // handshake-derived core IDs / counts. 
if (!init_done_.exchange(true, std::memory_order_acq_rel)) { @@ -512,9 +512,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ int32_t idle_iterations = 0; int32_t last_progress_count = 0; #if PTO2_PROFILING - auto &l2_perf = sched_l2_perf_[thread_idx]; - l2_perf.reset(); - l2_perf.l2_perf_enabled = (l2_perf_level_ != L2PerfLevel::DISABLED); + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; + l2_swimlane.reset(); + l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED); #endif constexpr int LOCAL_READY_CAP_PER_TYPE = 64; @@ -529,7 +529,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ bool cores_released = false; #if PTO2_PROFILING - l2_perf.sched_start_ts = get_sys_cnt_aicpu(); + l2_swimlane.sched_start_ts = get_sys_cnt_aicpu(); #endif while (true) { @@ -539,7 +539,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ bool made_progress = false; #if PTO2_PROFILING CYCLE_COUNT_START(); - l2_perf.sched_loop_count++; + l2_swimlane.sched_loop_count++; uint64_t _t0_phase = _t0; #endif int32_t task_count = 0; @@ -554,7 +554,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ } #if PTO2_PROFILING - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); #endif // Phase 1: Check running cores for completion @@ -616,16 +616,16 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #if PTO2_PROFILING if (!try_completed) { - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); } else { - CYCLE_COUNT_LAP(l2_perf.sched_complete_cycle); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_complete_count > 0) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_perf.sched_loop_count, - l2_perf.phase_complete_count + 
CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_complete_count > 0) { + l2_swimlane_aicpu_record_phase( + thread_idx, L2SwimlaneAicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_complete_count ); _t0_phase = _t1; - l2_perf.phase_complete_count = 0; + l2_swimlane.phase_complete_count = 0; } } #endif @@ -644,12 +644,12 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ if (wired > 0) { made_progress = true; #if PTO2_SCHED_PROFILING - l2_perf.phase_wiring_count += wired; + l2_swimlane.phase_wiring_count += wired; #endif } } #if PTO2_PROFILING - CYCLE_COUNT_LAP(l2_perf.sched_wiring_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle); #endif // Phase 3b: Drain dummy ready queue (thread 0 only). @@ -700,28 +700,28 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #if PTO2_PROFILING if (!try_pushed) { - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); } else { - CYCLE_COUNT_LAP(l2_perf.sched_dispatch_cycle); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_dispatch_count > 0) { + CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) { // Per-emit pop deltas via snapshot diff; the cumulative // pop_hit / pop_miss stay intact for the cold-path log. 
- uint64_t pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit; - uint64_t pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit; - // AicpuPhaseRecord's extras are uint32 — a delta that overflows means + uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; + // L2SwimlaneAicpuPhaseRecord's extras are uint32 — a delta that overflows means // an emit was missed for ~4 billion pops, which is well outside any // realistic dispatch cadence and silently truncates without this guard. debug_assert(pop_hit_delta < (1ULL << 32)); debug_assert(pop_miss_delta < (1ULL << 32)); - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_perf.sched_loop_count, - l2_perf.phase_dispatch_count, static_cast(pop_hit_delta), + l2_swimlane_aicpu_record_phase( + thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_dispatch_count, static_cast(pop_hit_delta), static_cast(pop_miss_delta) ); _t0_phase = _t1; - l2_perf.phase_dispatch_count = 0; - l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit; - l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss; + l2_swimlane.phase_dispatch_count = 0; + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; } } #endif @@ -756,17 +756,17 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ thread_idx, header, runtime, idle_iterations, last_progress_count #if PTO2_PROFILING , - l2_perf.sched_start_ts + l2_swimlane.sched_start_ts #endif ); } else { SPIN_WAIT_HINT(); } #if PTO2_PROFILING - CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, l2_perf.sched_loop_count, 0 + 
CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_record_phase( + thread_idx, L2SwimlaneAicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, l2_swimlane.sched_loop_count, 0 ); _t0_phase = _t1; } @@ -794,31 +794,31 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // sum(record.pop_*) reconciles with the run-cumulative counter. // Gate on SCHED_PHASES — at lower levels the phase buffer is never // flushed (see below), so writing this record would be wasted work. - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - uint64_t final_pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit; - uint64_t final_pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit; + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; debug_assert(final_pop_hit_delta < (1ULL << 32)); debug_assert(final_pop_miss_delta < (1ULL << 32)); if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) { uint64_t t_now = get_sys_cnt_aicpu(); - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_perf.sched_loop_count, 0, + l2_swimlane_aicpu_record_phase( + thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_swimlane.sched_loop_count, 0, static_cast(final_pop_hit_delta), static_cast(final_pop_miss_delta) ); - l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit; - l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss; + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; } } - log_l2_perf_summary(thread_idx, cur_thread_completed); + log_l2_swimlane_summary(thread_idx, cur_thread_completed); #endif #if PTO2_PROFILING - if (l2_perf.l2_perf_enabled) { - l2_perf_aicpu_flush_buffers( + if 
(l2_swimlane.l2_swimlane_enabled) { + l2_swimlane_aicpu_flush( thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num() ); - if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) { - l2_perf_aicpu_flush_phase_buffers(thread_idx); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_flush_phase_buffers(thread_idx); } } #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h index fd155307a..27eee8e3b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h @@ -342,8 +342,8 @@ struct SlotTransition { // ============================================================================= #if PTO2_PROFILING -struct alignas(64) SchedL2PerfCounters { - bool l2_perf_enabled{false}; +struct alignas(64) SchedL2SwimlaneCounters { + bool l2_swimlane_enabled{false}; uint64_t sched_start_ts{0}; uint64_t sched_scan_cycle{0}; uint64_t sched_complete_cycle{0}; @@ -371,7 +371,7 @@ struct alignas(64) SchedL2PerfCounters { uint64_t sched_dispatch_pop_cycle{0}; uint64_t sched_dispatch_setup_cycle{0}; #endif - void reset() { *this = SchedL2PerfCounters{}; } + void reset() { *this = SchedL2SwimlaneCounters{}; } }; #endif diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp index 6bab29993..ab9d246e5 100644 --- a/src/common/platform/onboard/host/device_runner_base.cpp +++ b/src/common/platform/onboard/host/device_runner_base.cpp @@ -939,7 +939,7 @@ void DeviceRunnerBase::start_shared_collectors_for_run() { return create_thread(std::move(fn)); }; if (enable_l2_swimlane_) { - l2_perf_collector_.start(thread_factory); + l2_swimlane_collector_.start(thread_factory); } if (enable_dump_tensor_) { dump_collector_.start(thread_factory); @@ -958,10 
+958,10 @@ void DeviceRunnerBase::teardown_shared_collectors_after_run() { // Diagnostic exports use the per-task `output_prefix_` directory the user // set on CallConfig (CallConfig::validate() enforces non-empty upstream). if (enable_l2_swimlane_) { - l2_perf_collector_.stop(); - l2_perf_collector_.read_phase_header_metadata(); - l2_perf_collector_.reconcile_counters(); - l2_perf_collector_.export_swimlane_json(); + l2_swimlane_collector_.stop(); + l2_swimlane_collector_.read_phase_header_metadata(); + l2_swimlane_collector_.reconcile_counters(); + l2_swimlane_collector_.export_swimlane_json(); } if (enable_dump_tensor_) { diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h index 72daed69d..73e42e5b2 100644 --- a/src/common/platform/onboard/host/device_runner_base.h +++ b/src/common/platform/onboard/host/device_runner_base.h @@ -50,11 +50,11 @@ #include "arg_direction.h" #include "callable.h" -#include "common/l2_perf_profiling.h" +#include "common/l2_swimlane_profiling.h" #include "device_arena.h" #include "device_runner_helpers.h" #include "host/load_aicpu_op.h" -#include "host/l2_perf_collector.h" +#include "host/l2_swimlane_collector.h" #include "host/memory_allocator.h" #include "host/pmu_collector.h" #include "host/scope_stats_collector.h" @@ -378,8 +378,8 @@ class DeviceRunnerBase { * `set_dep_gen_enabled` is a2a3-only and lives on the subclass. */ void set_l2_swimlane_enabled(int level) { - l2_perf_level_ = static_cast(level); - enable_l2_swimlane_ = (l2_perf_level_ != L2PerfLevel::DISABLED); + l2_swimlane_level_ = static_cast(level); + enable_l2_swimlane_ = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED); } void set_dump_tensor_enabled(bool enable) { enable_dump_tensor_ = enable; } void set_pmu_enabled(int enable_pmu) { @@ -390,7 +390,7 @@ class DeviceRunnerBase { /** * Directory under which all diagnostic artifacts - * (l2_perf_records.json / tensor_dump/ / pmu.csv) land. 
Required + * (l2_swimlane_records.json / tensor_dump/ / pmu.csv) land. Required * (non-empty) when any diagnostic is enabled; `CallConfig::validate()` * enforces this contract upstream. */ @@ -541,7 +541,7 @@ class DeviceRunnerBase { /** * Start collector mgmt + poll threads for the four shared - * diagnostics collectors (`l2_perf_collector_`, `dump_collector_`, + * diagnostics collectors (`l2_swimlane_collector_`, `dump_collector_`, * `pmu_collector_`, `scope_stats_collector_`) that are enabled. * Each `start()` is gated on the corresponding `enable_*_` flag; * disabled collectors are not started. @@ -557,7 +557,7 @@ class DeviceRunnerBase { * Tear down the four shared diagnostics collectors after the launched * kernels have synced. Each block is gated on the corresponding * `enable_*_` flag and does: stop() → reconcile_counters() → - * export step (`l2_perf` writes swimlane JSON via + * export step (`l2_swimlane` writes swimlane JSON via * `read_phase_header_metadata` + `export_swimlane_json`; `dump` * writes dump files; `pmu` has no export step beyond reconcile; * `scope_stats` writes JSONL). @@ -748,7 +748,7 @@ class DeviceRunnerBase { // direct `rtMalloc`/`rtFree`), but the storage and lifetime live // on the base. `DepGenCollector` is a2a3-only and stays on the // a2a3 subclass. 
- L2PerfCollector l2_perf_collector_; + L2SwimlaneCollector l2_swimlane_collector_; TensorDumpCollector dump_collector_; PmuCollector pmu_collector_; ScopeStatsCollector scope_stats_collector_; @@ -760,9 +760,9 @@ class DeviceRunnerBase { bool enable_dump_tensor_{false}; bool enable_pmu_{false}; bool enable_scope_stats_{false}; - L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() - PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() - std::string output_prefix_{}; // diagnostic artifact root directory + L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() + PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() + std::string output_prefix_{}; // diagnostic artifact root directory }; #endif // SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H diff --git a/src/common/task_interface/call_config.h b/src/common/task_interface/call_config.h index 58ca0076b..7926356dc 100644 --- a/src/common/task_interface/call_config.h +++ b/src/common/task_interface/call_config.h @@ -16,7 +16,7 @@ * `enable_dump_tensor`, `enable_pmu`, `enable_dep_gen`, and * `enable_scope_stats`. All five require `output_prefix` because they each * write a sibling artifact into that directory - * (`l2_perf_records.json` / `tensor_dump/` / `pmu.csv` / `deps.json` / + * (`l2_swimlane_records.json` / `tensor_dump/` / `pmu.csv` / `deps.json` / * `scope_stats.json`). * * `block_dim == 0` is a sentinel for "auto" — DeviceRunner resolves it at @@ -35,7 +35,7 @@ * across compilers (sizeof(bool) is implementation-defined). * * `output_prefix` is a NUL-terminated directory path under which all - * diagnostic artifacts (l2_perf_records.json / tensor_dump/ / pmu.csv / + * diagnostic artifacts (l2_swimlane_records.json / tensor_dump/ / pmu.csv / * deps.json / scope_stats.json) are written. 
The caller is responsible for * filling it whenever any diagnostic flag is enabled — `validate()` enforces * this contract at every submit/run entry point so the runtime never has to diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py index 428e6efbd..4c252128e 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py @@ -20,7 +20,7 @@ implicitly: if it broke, deps.json would be empty or wrong. deps.json is now the sole source of truth for fanout edges — the device -hot path no longer records L2PerfRecord::fanout[], so there is no +hot path no longer records L2SwimlaneAicpuTaskRecord::fanout[], so there is no "fanout ⊆ deps" cross-check to run. swimlane_converter.py joins deps.json into the Perfetto trace at post-process time. diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py index c2b3a18e1..e12c44fcd 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py @@ -48,12 +48,14 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None = matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime) if not matches: return - perf = matches[-1] / "l2_perf_records.json" - assert perf.exists(), f"l2_perf_records.json missing under {matches[-1]} — swimlane capture failed?" + perf = matches[-1] / "l2_swimlane_records.json" + assert perf.exists(), f"l2_swimlane_records.json missing under {matches[-1]} — swimlane capture failed?" 
with perf.open() as f: data = json.load(f) - assert data.get("l2_perf_level") in (1, 2, 3, 4), f"unexpected l2_perf_level: {data.get('l2_perf_level')}" + assert data.get("l2_swimlane_level") in (1, 2, 3, 4), ( + f"unexpected l2_swimlane_level: {data.get('l2_swimlane_level')}" + ) tasks = data.get("tasks") assert isinstance(tasks, list), "tasks field missing or not a list" assert len(tasks) > 0, f"perf records empty under {perf}" @@ -86,7 +88,7 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None = # ---- Tool smoke: sched_overhead_analysis ---- # pop_hit / pop_miss come from the dispatch-phase extras the runtime writes - # (l2_perf_collector.cpp). The differential block below cross-validates + # (l2_swimlane_collector.cpp). The differential block below cross-validates # the script's printed numbers against an independent oracle computed # straight from the raw artifacts — any regression in either the runtime # capture path or the parser arithmetic fails here in the same CI step @@ -96,7 +98,7 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None = sys.executable, "-m", "simpler_setup.tools.sched_overhead_analysis", - "--l2-perf-records-json", + "--l2-swimlane-records-json", str(perf), ], check=True, @@ -128,7 +130,7 @@ def verify_sched_overhead_differential(stdout: str, perf: dict, artifact_dir: Pa Args: stdout: captured ``sched_overhead_analysis`` stdout. - perf: parsed ``l2_perf_records.json`` dict — passed in by the caller + perf: parsed ``l2_swimlane_records.json`` dict — passed in by the caller so we don't re-read multi-MB profiling artifacts here. artifact_dir: per-case output directory. ``deps.json`` is looked up beside the perf JSON; absent → fanout / fanin half is skipped. 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py index ceed3eb7f..56f371b7a 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py @@ -8,7 +8,7 @@ # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- """L2 swimlane profiling smoke — capture pipeline produces a usable -``l2_perf_records.json``. +``l2_swimlane_records.json``. Re-uses ``vector_example`` as a known-good 5-task AIV-only workload. When the ``--enable-l2-swimlane`` flag is on, the helper in :mod:`_swimlane_validate` @@ -36,7 +36,7 @@ @scene_test(level=2, runtime="tensormap_and_ringbuffer") class TestL2Swimlane(SceneTestCase): - """Vector example with --enable-l2-swimlane, then assert l2_perf_records.json.""" + """Vector example with --enable-l2-swimlane, then assert l2_swimlane_records.json.""" CALLABLE = { "orchestration": {