From 507ecb3fd812abec6d79ec0846be3e5f9d126904 Mon Sep 17 00:00:00 2001
From: Chao Wang <26245345+ChaoWao@users.noreply.github.com>
Date: Sat, 30 May 2026 17:27:17 +0800
Subject: [PATCH] =?UTF-8?q?Refactor:=20holistic=20L2Perf=20=E2=86=92=20L2S?=
 =?UTF-8?q?wimlane=20rename=20+=20drop=20dead=20fields?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A holistic naming pass over the L2 swimlane subsystem:

  1. Type names get a consistent `L2Swimlane{Writer}{Kind}{Layer}` shape.
  2. Pool struct rename (BufferState → Pool) — captures the actual
     semantics (a pool of buffers, not "a buffer's state").
  3. ReadyQueueEntry::is_phase (uint32_t magic) becomes
     L2SwimlaneBufferKind enum class (uint32_t underlying — no ABI break).
  4. a2a3 only: drop L2SwimlaneAicpuTaskPool::aicore_ring_ptr and
     mismatch_record_count, both dead since #878 / #921.
  5. File names l2_perf_*.{h,cpp} renamed to l2_swimlane_*.{h,cpp}.

== Renaming scheme ==

Records / buffers / pools follow `L2Swimlane{Writer}{Kind}{Layer}`:

  L2PerfRecord            -> L2SwimlaneAicpuTaskRecord
  L2PerfAicoreRecord      -> L2SwimlaneAicoreTaskRecord
  AicpuPhaseRecord        -> L2SwimlaneAicpuPhaseRecord
  L2PerfBuffer            -> L2SwimlaneAicpuTaskBuffer
  L2PerfAicoreBuffer      -> L2SwimlaneAicoreTaskBuffer
  PhaseBuffer             -> L2SwimlaneAicpuPhaseBuffer
  L2PerfBufferState       -> L2SwimlaneAicpuTaskPool
  L2PerfAicoreBufferState -> L2SwimlaneAicoreTaskPool
  PhaseBufferState        -> L2SwimlaneAicpuPhasePool
  L2PerfDataHeader        -> L2SwimlaneDataHeader
  L2PerfLevel             -> L2SwimlaneLevel
  L2PerfFreeQueue         -> L2SwimlaneFreeQueue
  L2PerfModule            -> L2SwimlaneModule
  L2PerfCollector         -> L2SwimlaneCollector
  AicpuPhaseId            -> L2SwimlaneAicpuPhaseId
  AicpuPhaseHeader        -> L2SwimlaneAicpuPhaseHeader
  AicoreRotation          -> L2SwimlaneAicoreRotation
  AicoreLocalState        -> L2SwimlaneAicoreLocalState
  L2PerfAicoreRing  (a5)  -> L2SwimlaneAicoreRing

Enums (uint32_t underlying — wire format preserved):

  enum class L2SwimlaneBufferKind : uint32_t {
      AicpuTask  = 0,
      AicpuPhase = 1,
      AicoreTask = 2,
  };
  enum class ProfBufferType : uint32_t { AICPU_TASK = 0, AICPU_PHASE = 1, AICORE_TASK = 2 };

Functions / globals / statics: l2_perf_* -> l2_swimlane_*,
complete_record -> complete_task, flush_buffers -> flush.

KernelArgs cross-platform fields:
  l2_perf_data_base -> l2_swimlane_base
  aicore_ring_addr  -> l2_swimlane_aicore_rotation_table

Files:
  include/{common,host,aicpu,aicore}/l2_perf_*.h
  -> include/{common,host,aicpu,aicore}/l2_swimlane_*.h
  src/{host,aicpu}/l2_perf_*.cpp
  -> src/{host,aicpu}/l2_swimlane_*.cpp
  Output artifact: l2_perf_records.json -> l2_swimlane_records.json

== Dead-field drop (a2a3 only) ==

L2SwimlaneAicpuTaskPool::aicore_ring_ptr        — removed
L2SwimlaneAicpuTaskPool::mismatch_record_count  — removed

Why both fields are dead:
  - #878 moved AICore writes to a per-core L2SwimlaneAicoreTaskBuffer
    pool with its own state (L2SwimlaneAicoreTaskPool).
  - #921 moved the rotation-table fill from host to AICPU.
  - mismatch_record_count was last written before #878.

Pad adjusted to keep sizeof(L2SwimlaneAicpuTaskPool) == 192 (static_assert
remains green). Host reconcile() loop simplifies to total/dropped only.

a5 keeps both fields — its AICore pipeline still uses the legacy
staging-ring design where they are live.

== Behaviour ==

Pure rename plus the a2a3 dead-field drop. The only struct layout change
is that the 12 bytes nobody read or wrote are absorbed by the adjusted
pad, so sizeof(L2SwimlaneAicpuTaskPool) stays 192. uint32_t-underlying
enums preserve wire format.

== Test plan ==

- All four platform variants (a2a3, a2a3sim, a5, a5sim) build clean
- tests/st/.../l2_swimlane pass on a2a3sim
- Same ST passes on a2a3 device 1
- a5sim smoke (spmd_basic) passes
---
 conftest.py                                   |   2 +-
 docs/dfx/dep_gen.md                           |  10 +-
 docs/dfx/l2-swimlane-profiling.md             | 168 +++++-----
 docs/dfx/pmu-profiling.md                     |   2 +-
 docs/dfx/tensor-dump.md                       |   2 +-
 docs/hardware/cache-coherency.md              |  12 +-
 docs/profiling-framework.md                   |  30 +-
 docs/profiling-name-map.md                    |  10 +-
 docs/sim_multi_device_isolation.md            |   2 +-
 docs/testing.md                               |   6 +-
 .../workers/l2/vector_add/test_run_timing.py  |   2 +-
 scope_stats/scope_stats.jsonl                 |   3 +
 simpler_setup/scene_test.py                   |  10 +-
 simpler_setup/tools/README.md                 |  48 +--
 simpler_setup/tools/deps_to_graph.py          |  10 +-
 .../tools/sched_overhead_analysis.py          |  67 ++--
 simpler_setup/tools/swimlane_converter.py     |  53 +--
 .../include/aicore/aicore_profiling_state.h   |  22 +-
 ...icore.h => l2_swimlane_collector_aicore.h} |  39 +--
 .../include/aicpu/dep_gen_collector_aicpu.h   |   2 +-
 ..._aicpu.h => l2_swimlane_collector_aicpu.h} | 104 +++---
 src/a2a3/platform/include/common/dep_gen.h    |   4 +-
 .../platform/include/common/kernel_args.h     |  14 +-
 ...rf_profiling.h => l2_swimlane_profiling.h} | 292 ++++++++---------
 .../platform/include/common/platform_config.h |  12 +-
 .../platform/include/common/pmu_profiling.h   |   8 +-
 .../platform/include/common/scope_stats.h     |   2 +-
 .../platform/include/common/tensor_dump.h     |   2 +-
 .../platform/include/host/dep_gen_collector.h |   2 +-
 ...rf_collector.h => l2_swimlane_collector.h} | 199 ++++++------
 .../profiling_common/buffer_pool_manager.h    |   4 +-
 .../host/profiling_common/profiler_base.h     |  14 +-
 src/a2a3/platform/onboard/aicore/kernel.cpp   |  34 +-
 src/a2a3/platform/onboard/aicpu/kernel.cpp    |   6 +-
 src/a2a3/platform/onboard/host/CMakeLists.txt |   2 +-
 .../platform/onboard/host/device_runner.cpp   |  23 +-
 .../platform/onboard/host/device_runner.h     |  10 +-
 src/a2a3/platform/sim/aicore/inner_kernel.h   |   4 +-
 src/a2a3/platform/sim/aicore/kernel.cpp       |  39 +--
 src/a2a3/platform/sim/host/CMakeLists.txt     |   2 +-
 src/a2a3/platform/sim/host/device_runner.cpp  |  58 ++--
 src/a2a3/platform/sim/host/device_runner.h    |  22 +-
 ...pu.cpp => l2_swimlane_collector_aicpu.cpp} | 304 ++++++++++--------
 ...ollector.cpp => l2_swimlane_collector.cpp} | 230 +++++++------
 .../aicore/aicore_executor.cpp                |  22 +-
 .../host_build_graph/aicpu/aicpu_executor.cpp |  77 ++---
 .../host_build_graph/runtime/runtime.h        |   2 +-
 .../aicore/aicore_executor.cpp                |  32 +-
 .../aicpu/aicpu_executor.cpp                  |  12 +-
 .../docs/profiling_levels.md                  |  26 +-
 .../host/dep_gen_replay.h                     |   2 +-
 .../runtime/pto_orchestrator.cpp              |  54 ++--
 .../runtime/pto_orchestrator.h                |   6 +-
 .../runtime/runtime.h                         |   2 +-
 .../runtime/scheduler/scheduler_cold_path.cpp | 110 ++++---
 .../scheduler/scheduler_completion.cpp        |  28 +-
 .../runtime/scheduler/scheduler_context.h     |  12 +-
 .../runtime/scheduler/scheduler_dispatch.cpp  | 112 +++----
 .../runtime/scheduler/scheduler_types.h       |   6 +-
 .../include/aicore/aicore_profiling_state.h   |  12 +-
 ...icore.h => l2_swimlane_collector_aicore.h} |  27 +-
 ..._aicpu.h => l2_swimlane_collector_aicpu.h} |  82 ++---
 src/a5/platform/include/common/kernel_args.h  |  22 +-
 ...rf_profiling.h => l2_swimlane_profiling.h} | 220 +++++++------
 .../platform/include/common/platform_config.h |  14 +-
 .../platform/include/common/pmu_profiling.h   |   2 +-
 src/a5/platform/include/common/scope_stats.h  |   2 +-
 ...rf_collector.h => l2_swimlane_collector.h} | 151 ++++-----
 .../profiling_common/buffer_pool_manager.h    |  10 +-
 .../host/profiling_common/profiler_base.h     |  18 +-
 src/a5/platform/onboard/aicore/kernel.cpp     |  22 +-
 src/a5/platform/onboard/aicpu/kernel.cpp      |   4 +-
 src/a5/platform/onboard/host/CMakeLists.txt   |   2 +-
 .../platform/onboard/host/device_runner.cpp   |  26 +-
 src/a5/platform/onboard/host/device_runner.h  |  16 +-
 src/a5/platform/sim/aicore/inner_kernel.h     |   4 +-
 src/a5/platform/sim/aicore/kernel.cpp         |  24 +-
 src/a5/platform/sim/host/CMakeLists.txt       |   2 +-
 src/a5/platform/sim/host/device_runner.cpp    |  51 +--
 src/a5/platform/sim/host/device_runner.h      |  22 +-
 ...pu.cpp => l2_swimlane_collector_aicpu.cpp} | 248 +++++++-------
 .../platform/src/aicpu/tensor_dump_aicpu.cpp  |   2 +-
 ...ollector.cpp => l2_swimlane_collector.cpp} | 195 +++++------
 src/a5/platform/src/host/pmu_collector.cpp    |   2 +-
 .../aicore/aicore_executor.cpp                |  12 +-
 .../host_build_graph/aicpu/aicpu_executor.cpp |  83 ++---
 .../aicore/aicore_executor.cpp                |  12 +-
 .../aicpu/aicpu_executor.cpp                  |  12 +-
 .../docs/profiling_levels.md                  |  26 +-
 .../runtime/pto_orchestrator.cpp              |  74 ++---
 .../runtime/pto_orchestrator.h                |   6 +-
 .../runtime/scheduler/scheduler_cold_path.cpp | 117 +++----
 .../scheduler/scheduler_completion.cpp        |  28 +-
 .../runtime/scheduler/scheduler_context.h     |  12 +-
 .../runtime/scheduler/scheduler_dispatch.cpp  | 120 +++----
 .../runtime/scheduler/scheduler_types.h       |   6 +-
 .../onboard/host/device_runner_base.cpp       |  10 +-
 .../onboard/host/device_runner_base.h         |  22 +-
 src/common/task_interface/call_config.h       |   4 +-
 .../dfx/dep_gen/test_dep_gen.py               |   2 +-
 .../dfx/l2_swimlane/_swimlane_validate.py     |  14 +-
 .../dfx/l2_swimlane/test_l2_swimlane.py       |   4 +-
 102 files changed, 2111 insertions(+), 2027 deletions(-)
 create mode 100644 scope_stats/scope_stats.jsonl
 rename src/a2a3/platform/include/aicore/{l2_perf_collector_aicore.h => l2_swimlane_collector_aicore.h} (75%)
 rename src/a2a3/platform/include/aicpu/{l2_perf_collector_aicpu.h => l2_swimlane_collector_aicpu.h} (66%)
 rename src/a2a3/platform/include/common/{l2_perf_profiling.h => l2_swimlane_profiling.h} (68%)
 rename src/a2a3/platform/include/host/{l2_perf_collector.h => l2_swimlane_collector.h} (62%)
 rename src/a2a3/platform/src/aicpu/{l2_perf_collector_aicpu.cpp => l2_swimlane_collector_aicpu.cpp} (68%)
 rename src/a2a3/platform/src/host/{l2_perf_collector.cpp => l2_swimlane_collector.cpp} (82%)
 rename src/a5/platform/include/aicore/{l2_perf_collector_aicore.h => l2_swimlane_collector_aicore.h} (73%)
 rename src/a5/platform/include/aicpu/{l2_perf_collector_aicpu.h => l2_swimlane_collector_aicpu.h} (67%)
 rename src/a5/platform/include/common/{l2_perf_profiling.h => l2_swimlane_profiling.h} (69%)
 rename src/a5/platform/include/host/{l2_perf_collector.h => l2_swimlane_collector.h} (66%)
 rename src/a5/platform/src/aicpu/{l2_perf_collector_aicpu.cpp => l2_swimlane_collector_aicpu.cpp} (64%)
 rename src/a5/platform/src/host/{l2_perf_collector.cpp => l2_swimlane_collector.cpp} (80%)

diff --git a/conftest.py b/conftest.py
index 6385a8f67..4c3cac374 100644
--- a/conftest.py
+++ b/conftest.py
@@ -572,7 +572,7 @@ def sort_key(item):
     items.sort(key=sort_key)
 
     # L3 perf collection is not supported yet: a single L3 case forks N chip-processes
-    # that all write l2_perf_records_<ts>.json to the same directory with
+    # that all write l2_swimlane_records_<ts>.json to the same directory with
     # second-precision timestamps, so they trample each other. Block the
     # combination up front; waiting for a proper device-id-in-filename fix.
     if config.getoption("--enable-l2-swimlane", default=0):
diff --git a/docs/dfx/dep_gen.md b/docs/dfx/dep_gen.md
index c1e83f8d4..1fda52e71 100644
--- a/docs/dfx/dep_gen.md
+++ b/docs/dfx/dep_gen.md
@@ -6,7 +6,7 @@ The swimlane profiler's per-task `fanout[]` array is the obvious place to
 read "which tasks did task X feed into?" — but it is **structurally
 incomplete on real hardware**.
 
-Each producer task carries its own `L2PerfRecord.fanout[RUNTIME_MAX_FANOUT]`,
+Each producer task carries its own `L2SwimlaneAicpuTaskRecord.fanout[RUNTIME_MAX_FANOUT]`,
 populated by the AICPU scheduler at the moment it wires a downstream
 consumer. If a producer has already finished and transitioned to
 `PTO2_TASK_COMPLETED` by the time a later submit wants to register a
@@ -84,7 +84,7 @@ The `--enable-l2-swimlane` flag is independent but recommended in pair
 because:
 
 - `deps.json` is the dep_gen artifact.
-- `l2_perf_records.json` (from swimlane) is the timing artifact;
+- `l2_swimlane_records.json` (from swimlane) is the timing artifact;
   `merged_swimlane.json` (the Perfetto trace) uses `deps.json` for
   dependency arrows when both files exist.
 - The "fanout ⊆ deps" validation gate fires only when both files are
@@ -262,7 +262,7 @@ Node visual encoding (legend top-right of the rendered HTML):
 | Gray dashed note | alloc — task from `alloc_tensors` (got a task_id, references downstream via `owner_task_id`, but never dispatched a kernel so has no perf record) |
 
 Labels read as `(ring, local) · func_name · core_type-implicit-via-shape`.
-When a colocated `l2_perf_records.json` is present the func_id is enriched
+When a colocated `l2_swimlane_records.json` is present the func_id is enriched
 with the kernel name via the sibling `name_map_<case>.json` (written by
 SceneTest's `_dump_name_map`).
 
@@ -288,11 +288,11 @@ sources / args / slices, so the raw `edges[]` count is a superset of the
 underlying task-pair count.
 
 `deps.json` (projected) is a **superset** of the fanout edges in
-`l2_perf_records.json`:
+`l2_swimlane_records.json`:
 
 | Edge source | Captures | Drops on race? |
 | ----------- | -------- | -------------- |
-| `task.fanout[]` (L2PerfRecord) | Successors known at producer-retire time | **Yes** — sealed when producer retires |
+| `task.fanout[]` (L2SwimlaneAicpuTaskRecord) | Successors known at producer-retire time | **Yes** — sealed when producer retires |
 | `deps.json` (this feature) | Every consumer → producer reachable via tensormap / explicit_deps | No — replay sees every submit |
 
 `tests/st/a2a3/tensormap_and_ringbuffer/dep_gen_capture/test_dep_gen_capture.py`
diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md
index 7ef434b8e..43255711a 100644
--- a/docs/dfx/l2-swimlane-profiling.md
+++ b/docs/dfx/l2-swimlane-profiling.md
@@ -45,7 +45,7 @@ available.
   `g_orch_*_cycle` counters — that's where you go for "which
   sub-step dominates overall"; the per-submit record covers
   "which submit was slow".
-- **Standard outputs** — raw `l2_perf_records.json`, plus a
+- **Standard outputs** — raw `l2_swimlane_records.json`, plus a
   Perfetto-loadable `merged_swimlane_*.json` produced by
   `swimlane_converter`.
 
@@ -91,13 +91,13 @@ python tests/st/<case>/test_<name>.py -p <platform> -d 0 --enable-l2-swimlane
 The flag sets `CallConfig::enable_l2_swimlane` to the chosen
 level. The host then allocates the per-core / per-thread shared
 region and publishes its base address through
-`kernel_args.l2_perf_data_base`. AICore writes timing into
+`kernel_args.l2_swimlane_data_base`. AICore writes timing into
 per-task WIP slots; AICPU commits the records on FIN. Per-task
 dispatch/finish timestamps and fanout are recorded only at
 level >= 2, scheduler phase records only at level >= 3, and
 orchestrator phase records only at level >= 4.
 
-The JSON output `"l2_perf_level"` field is the captured perf_level:
+The JSON output `"l2_swimlane_level"` field is the captured perf_level:
 `1` = AICore timing only, `2` = +dispatch/fanout,
 `3` = +scheduler phases, `4` = +orchestrator phases.
 
@@ -114,7 +114,7 @@ runs):
 
 ```text
 <output_prefix>/
-├── l2_perf_records.json     # raw runtime output
+├── l2_swimlane_records.json # raw runtime output
 ├── name_map_<case>.json     # optional func_id → name mapping
 └── merged_swimlane.json     # Perfetto trace (added by converter)
 ```
@@ -122,7 +122,7 @@ runs):
 Filenames are fixed (no per-file timestamp) — the directory is the
 per-task uniqueness boundary.
 
-`l2_perf_records.json` carries the raw records — this is the file
+`l2_swimlane_records.json` carries the raw records — this is the file
 you pass to `swimlane_converter`. Important fields per task:
 
 | Field | Meaning |
@@ -157,17 +157,17 @@ unassigned).
 and produces a per-function task-execution summary:
 
 ```bash
-# Auto-detects the latest outputs/*/l2_perf_records.json
+# Auto-detects the latest outputs/*/l2_swimlane_records.json
 python -m simpler_setup.tools.swimlane_converter
 
 # Pin to a specific case + add func_id → name mapping
 python -m simpler_setup.tools.swimlane_converter \
-    outputs/<case>_<ts>/l2_perf_records.json \
+    outputs/<case>_<ts>/l2_swimlane_records.json \
     --func-names outputs/<case>_<ts>/name_map_<case>.json
 
 # Custom output path
 python -m simpler_setup.tools.swimlane_converter \
-    outputs/<case>_<ts>/l2_perf_records.json -o my_trace.json
+    outputs/<case>_<ts>/l2_swimlane_records.json -o my_trace.json
 ```
 
 The output is `outputs/<case>_<ts>/merged_swimlane.json` (or your
@@ -244,70 +244,70 @@ What the swimlane shows:
 
 ### 5.1 Common interfaces
 
-`kernel_args.l2_perf_data_base` is the single device-side handle
+`kernel_args.l2_swimlane_data_base` is the single device-side handle
 host publishes for the run. The shared region carries a fixed
-`L2PerfDataHeader` plus per-core / per-thread state (same struct
+`L2SwimlaneDataHeader` plus per-core / per-thread state (same struct
 shape on both architectures):
 
 ```text
-L2PerfDataHeader                                (host init, device R/W)
+L2SwimlaneDataHeader                            (host init, device R/W)
 ├── queues  [MAX_AICPU_THREADS][READYQUEUE_SIZE]
 ├── queue_heads / queue_tails (per-thread)
 └── num_cores
 
-L2PerfBufferState[num_cores]                    (per-core AICPU pool state)
+L2SwimlaneAicpuTaskPool[num_cores]              (per-core AICPU pool state)
 ├── free_queue {buffer_ptrs[SLOT_COUNT], head, tail}
-├── current_buf_ptr           (AICPU active L2PerfBuffer*)
+├── current_buf_ptr           (AICPU active L2SwimlaneAicpuTaskBuffer*)
 ├── aicore_ring_ptr           (legacy; kept for ABI continuity)
 ├── total_record_count
 ├── dropped_record_count
 └── mismatch_record_count     (legacy; no longer written)
 
-L2PerfAicoreBufferState[num_cores]              (per-core AICore pool state)
+L2SwimlaneAicoreTaskPool[num_cores]             (per-core AICore pool state)
 ├── rotation {current_buf_ptr, generation}      (AICPU writes, AICore reads
 │                                                — cache-line independent)
 ├── free_queue {buffer_ptrs[SLOT_COUNT], head, tail}
 ├── total_record_count / dropped_record_count
 └── current_buf_seq
 
-[L2PerfAicoreBuffer × PLATFORM_AICORE_BUFFERS_PER_CORE per core]
-└── L2PerfAicoreRecord records[PLATFORM_AICORE_BUFFER_SIZE]  (1024 records, 32B each)
+[L2SwimlaneAicoreTaskBuffer × PLATFORM_AICORE_BUFFERS_PER_CORE per core]
+└── L2SwimlaneAicoreTaskRecord records[PLATFORM_AICORE_BUFFER_SIZE]  (1024 records, 32B each)
 
-[AicpuPhaseHeader + PhaseBufferState[num_threads]]  (optional)
+[L2SwimlaneAicpuPhaseHeader + L2SwimlaneAicpuPhasePool[num_threads]]  (optional)
 ├── magic / num_sched_threads
 ├── core_to_thread[]  (core_id → scheduler thread index)
-└── per-thread phase buffers (PhaseBufferState aliases L2PerfBufferState)
+└── per-thread phase buffers (L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool)
 ```
 
 The records themselves are identical across architectures:
 
-- `L2PerfRecord` — per-task AICPU-owned fields (task_id, dispatch_time,
+- `L2SwimlaneAicpuTaskRecord` — per-task AICPU-owned fields (task_id, dispatch_time,
   finish_time, func_id, core_type, reg_task_id), 64-byte aligned.
   `reg_task_id` is the join key against the matching AICore record.
-- `L2PerfAicoreRecord` — slim AICore-only record (start, end, task_id),
+- `L2SwimlaneAicoreTaskRecord` — slim AICore-only record (start, end, task_id),
   32 bytes; AICore writes one per task into its currently-active
   per-core buffer.
-- `AicpuPhaseRecord` — per-iteration scheduler / orchestrator
+- `L2SwimlaneAicpuPhaseRecord` — per-iteration scheduler / orchestrator
   phase, 40 bytes.
 
 This is the key reason a single `swimlane_converter` consumes
 both architectures' output unchanged. Orchestrator timing is carried
-by per-submit `AicpuPhaseRecord` entries (ORCH_SUBMIT, folded from
+by per-submit `L2SwimlaneAicpuPhaseRecord` entries (ORCH_SUBMIT, folded from
 the historical per-sub-step records); there is no separate
 shared-memory aggregate. The run-window envelope is emitted to device
 log via `LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…"`.
 
 **Producer/consumer protocol on AICore (AICore-as-producer with rotation).**
-AICore writes a slim `L2PerfAicoreRecord` into its currently-active per-core
-`L2PerfAicoreBuffer` at `records[slot_within_buf++]`. The active buffer is
-published via a per-core `AicoreRotation` cache line (`current_buf_ptr` +
+AICore writes a slim `L2SwimlaneAicoreTaskRecord` into its currently-active per-core
+`L2SwimlaneAicoreTaskBuffer` at `records[slot_within_buf++]`. The active buffer is
+published via a per-core `L2SwimlaneAicoreRotation` cache line (`current_buf_ptr` +
 `generation`); AICore `dcci`'s it per task — cheap relative to the
 baseline `dcci(payload, ENTIRE_DATA_CACHE)` it already pays per task.
 AICPU drives rotation: immediately before each `write_reg(DATA_MAIN_BASE)`
 for task `K`, if `K % PLATFORM_AICORE_BUFFER_SIZE == 0`, AICPU enqueues
 the current buffer to the per-thread ready queue (kind `is_phase=2`),
-pops the next from `L2PerfAicoreBufferState::free_queue`, and bumps
-`AicoreRotation::generation`. AICore detects the bumped generation on
+pops the next from `L2SwimlaneAicoreTaskPool::free_queue`, and bumps
+`L2SwimlaneAicoreRotation::generation`. AICore detects the bumped generation on
 its next task's `dcci`, refreshes its local cache, and resets its slot
 counter to 0.
 
@@ -334,18 +334,18 @@ sched overhead per session as price for unbounded session length).
 
 `halHostRegister` maps device memory into host virtual address
 space so the host can read device buffers directly.
-`L2PerfCollector` runs two background threads on top of a
-[`BufferPoolManager<L2PerfModule>`](../src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h):
+`L2SwimlaneCollector` runs two background threads on top of a
+[`BufferPoolManager<L2SwimlaneModule>`](../src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h):
 a mgmt thread that polls SPSC ready queues and recycles full
 buffers **while kernels are still executing**, plus a poll
 thread that drains the L2 hand-off queue into
 `on_buffer_collected`.
 
-`L2PerfModule` declares two buffer kinds going through one ready
+`L2SwimlaneModule` declares two buffer kinds going through one ready
 queue per AICPU thread:
 
-- **kind 0**: per-core `L2PerfBuffer` (task records).
-- **kind 1**: per-thread `PhaseBuffer` (scheduler / orchestrator
+- **kind 0**: per-core `L2SwimlaneAicpuTaskBuffer` (task records).
+- **kind 1**: per-thread `L2SwimlaneAicpuPhaseBuffer` (scheduler / orchestrator
   phase records).
 
 The `is_phase` flag on each `ReadyQueueEntry` picks between them.
@@ -355,7 +355,7 @@ and TensorDump are single-kind.
 ```text
         HOST                                         DEVICE
 ┌──────────────────────────┐               ┌──────────────────────────┐
-│ L2PerfCollector          │               │ AICPU + AICore           │
+│ L2SwimlaneCollector      │               │ AICPU + AICore           │
 │                          │               │                          │
 │ initialize(prefix)       │  alloc +      │ AICore on task end:      │
 │   rtMalloc + halRegister │──register────>│   write timing into      │
@@ -365,14 +365,14 @@ and TensorDump are single-kind.
 │ start(tf)                │               │   commit ring slot →     │
 │   ┌────────────────────┐ │ SPSC ready    │     records[count],      │
 │   │ mgmt thread        │ │ queues        │   fill func_id /         │
-│   │ (BufferPool driver)│ │<──L2Perf──────│   dispatch / finish /    │
+│   │ (BufferPool driver)│ │<──L2Swimlane──│   dispatch / finish /    │
 │   │   poll ready queue │<┼──+ Phase─────<│   fanout; rotate buffer  │
 │   │   recycle buffers  │─┼──free queue──>│   when full              │
 │   └────────────────────┘ │               │ AICPU scheduler thread:  │
 │   ┌────────────────────┐ │               │   per-loop-iter:         │
 │   │ poll thread        │ │               │     write AicpuPhase-    │
 │   │   reads via host   │ │ shared mem    │     Record into          │
-│   │   mapping; copies  │<┼──mapping─────<│     PhaseBuffer          │
+│   │   mapping; copies  │<┼──mapping─────<│     L2SwimlaneAicpuPhaseBuffer          │
 │   │   to host vectors  │ │               │                          │
 │   └────────────────────┘ │               │                          │
 │ stop()                   │               │                          │
@@ -380,16 +380,16 @@ and TensorDump are single-kind.
 │ read_phase_header_metadata()             │                          │
 │ reconcile_counters()     │               │                          │
 │ export_swimlane_json()   │               │                          │
-│   → l2_perf_records.json │               │                          │
+│   → l2_swimlane_records.json │               │                          │
 └──────────────────────────┘               └──────────────────────────┘
 ```
 
 **Lifecycle** (`device_runner.cpp`):
 
 ```text
-init_l2_perf()
-  l2_perf_collector_.initialize(num_aicore, ..., output_prefix_)
-  kernel_args_.args.l2_perf_data_base = l2_perf_collector_.get_l2_perf_shm_device_ptr()
+init_l2_swimlane()
+  l2_swimlane_collector_.initialize(num_aicore, ..., output_prefix_)
+  kernel_args_.args.l2_swimlane_data_base = l2_swimlane_collector_.get_l2_swimlane_shm_device_ptr()
 start(tf)                          ← spawn mgmt + poll threads
 launch AICPU / AICore
 rtStreamSynchronize
@@ -400,16 +400,16 @@ reconcile_counters()               ← three-bucket accounting for both
                                      PERF and PHASE pools (total /
                                      collected / dropped); any non-zero
                                      current_buf_ptr is a flush bug
-export_swimlane_json()             ← writes <output_prefix>/l2_perf_records.json
+export_swimlane_json()             ← writes <output_prefix>/l2_swimlane_records.json
 finalize(unregister, free)
 ```
 
-[`L2PerfCollector`](../src/a2a3/platform/include/host/l2_perf_collector.h)
+[`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h)
 on a2a3 inherits from
-[`profiling_common::ProfilerBase<L2PerfCollector, L2PerfModule>`](../src/a2a3/platform/include/host/profiling_common/profiler_base.h):
+[`profiling_common::ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule>`](../src/a2a3/platform/include/host/profiling_common/profiler_base.h):
 the base class owns the mgmt thread, the poll thread, and the
-`BufferPoolManager<L2PerfModule>` they share. `L2PerfCollector`
-supplies the L2-specific pieces — the `L2PerfModule` trait
+`BufferPoolManager<L2SwimlaneModule>` they share. `L2SwimlaneCollector`
+supplies the L2-specific pieces — the `L2SwimlaneModule` trait
 (notably `kBufferKinds = 2` and `kind_of()`), `initialize` that
 allocates and pre-fills both kinds of free queues, an
 `on_buffer_collected` callback that branches on
@@ -423,8 +423,8 @@ framework reference.
 
 ### 5.3 a5 — same framework, host-shadow transport
 
-a5's `L2PerfCollector` derives from
-`ProfilerBase<L2PerfCollector, L2PerfModule>` and shares the
+a5's `L2SwimlaneCollector` derives from
+`ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule>` and shares the
 mgmt + poll thread structure with a2a3. The single behavioral
 deviation from §5.2 is the **transport channel**: a5 has no
 `halHostRegister`, so each device buffer is paired with a
@@ -432,32 +432,32 @@ host-shadow `malloc()` and the mgmt loop synchronizes the two via
 `profiling_copy.h` (`rtMemcpy` onboard, plain `memcpy` in sim).
 
 The AICore-side write target is a per-core, **stable**
-`L2PerfAicoreRing` (`dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE]`)
+`L2SwimlaneAicoreRing` (`dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE]`)
 allocated once by the host and addressed via
-`L2PerfBufferState::aicore_ring_ptr` (AICPU side) and
-`KernelArgs::aicore_l2_perf_ring_addrs[block_idx]` forwarded into
-`set_aicore_l2_perf_ring()` by `KERNEL_ENTRY` (AICore side). The ring
+`L2SwimlaneAicpuTaskPool::aicore_ring_ptr` (AICPU side) and
+`KernelArgs::aicore_l2_swimlane_ring_addrs[block_idx]` forwarded into
+`set_aicore_l2_swimlane_ring()` by `KERNEL_ENTRY` (AICore side). The ring
 address never changes during a run, so AICore's write address is
-decoupled from the AICPU's rotating `L2PerfBuffer`. Buffer rotation is
-internal to `l2_perf_aicpu_complete_record` when `records[count]` hits
+decoupled from the AICPU's rotating `L2SwimlaneAicpuTaskBuffer`. Buffer rotation is
+internal to `l2_swimlane_aicpu_complete_task` when `records[count]` hits
 `PLATFORM_PROF_BUFFER_SIZE`. The runtime `Handshake` carries no
 profiling fields.
 
 The framework's `MemoryOps` therefore carries five callbacks on
 a5 (`alloc` / `reg` / `free_` / `copy_to_device` /
 `copy_from_device`); the mgmt loop mirrors the entire shm region
-(`L2PerfDataHeader` + per-core `L2PerfBufferState` + per-thread
-`PhaseBufferState`) device → host at the top of every tick, then
+(`L2SwimlaneDataHeader` + per-core `L2SwimlaneAicpuTaskPool` + per-thread
+`L2SwimlaneAicpuPhasePool`) device → host at the top of every tick, then
 pushes back only the fields host actually modified (advanced
 `queue_heads[q]`, refilled `free_queue.tail` and
 `buffer_ptrs[slot]`) via `BufferPoolManager::write_range_to_device`.
 The bulk `mirror_shm_to_device` is deliberately **not** called from
 the mgmt loop: it would race with AICPU writes to device-only
 fields (`current_buf_ptr`, `total/dropped/mismatch` counters,
-`queue_tails`, `free_queue.head`, `AicpuPhaseHeader::magic`,
+`queue_tails`, `free_queue.head`, `L2SwimlaneAicpuPhaseHeader::magic`,
 `core_to_thread[]`) and roll them back to whatever the host shadow
 held at the start of the tick. Per-buffer
-payloads (`L2PerfBuffer` / `PhaseBuffer`) are pulled on demand
+payloads (`L2SwimlaneAicpuTaskBuffer` / `L2SwimlaneAicpuPhaseBuffer`) are pulled on demand
 inside `ProfilerAlgorithms::process_entry` after a popped
 ready-entry resolves to its host shadow. `BufferPoolManager`'s
 `release_owned_buffers` frees the device pointer via the
@@ -466,14 +466,14 @@ collector's `release_fn` and the paired shadow via `std::free()`.
 ```text
         HOST                                         DEVICE
 ┌──────────────────────────┐               ┌──────────────────────────┐
-│ L2PerfCollector          │               │ AICPU + AICore           │
+│ L2SwimlaneCollector      │               │ AICPU + AICore           │
 │   : ProfilerBase<...>    │               │                          │
 │                          │               │                          │
 │ initialize()             │  alloc + reg  │ AICore on task end:      │
 │   rtMalloc shm           │──+ shadow────>│   write timing into      │
-│   per-core L2PerfBuffer  │   memset 0    │   per-core ring slot     │
+│   per-core L2SwimlaneAicpuTaskBuffer  │   memset 0    │   per-core ring slot     │
 │   per-core AicoreRing    │   + push 0s   │   dual_issue_slots[      │
-│   per-thread PhaseBuffer │               │     task_id & 1]         │
+│   per-thread L2SwimlaneAicpuPhaseBuffer │               │     task_id & 1]         │
 │   register_mapping(s)    │               │                          │
 │   set_memory_context     │               │ AICPU on FIN:            │
 │                          │               │   read ring slot →       │
@@ -515,31 +515,31 @@ collector's `release_fn` and the paired shadow via `std::free()`.
 **Lifecycle** (`device_runner.cpp`):
 
 ```text
-init_l2_perf()
-  l2_perf_collector_.initialize(num_aicore, ..., output_prefix_)
-  kernel_args_.args.l2_perf_data_base = l2_perf_collector_.get_l2_perf_setup_device_ptr()
-  kernel_args_.args.aicore_l2_perf_ring_addrs =
-      l2_perf_collector_.get_aicore_ring_addrs_device_ptr()
-l2_perf_collector_.start(thread_factory)   ← mgmt + poll threads
+init_l2_swimlane()
+  l2_swimlane_collector_.initialize(num_aicore, ..., output_prefix_)
+  kernel_args_.args.l2_swimlane_data_base = l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr()
+  kernel_args_.args.aicore_l2_swimlane_ring_addrs =
+      l2_swimlane_collector_.get_aicore_ring_addrs_device_ptr()
+l2_swimlane_collector_.start(thread_factory)   ← mgmt + poll threads
 launch AICPU / AICore
 rtStreamSynchronize
-l2_perf_collector_.stop()                  ← join mgmt + poll, drain final batch
-l2_perf_collector_.read_phase_header_metadata()
-l2_perf_collector_.reconcile_counters()    ← sanity-check + 3-bucket cross-check
-l2_perf_collector_.export_swimlane_json()
-l2_perf_collector_.finalize()
+l2_swimlane_collector_.stop()                  ← join mgmt + poll, drain final batch
+l2_swimlane_collector_.read_phase_header_metadata()
+l2_swimlane_collector_.reconcile_counters()    ← sanity-check + 3-bucket cross-check
+l2_swimlane_collector_.export_swimlane_json()
+l2_swimlane_collector_.finalize()
 ```
 
-[`L2PerfCollector`](../src/a5/platform/include/host/l2_perf_collector.h)
+[`L2SwimlaneCollector`](../src/a5/platform/include/host/l2_swimlane_collector.h)
 on a5 inherits the same CRTP base
 ([`profiling_common::ProfilerBase`](../src/a5/platform/include/host/profiling_common/profiler_base.h))
 as a2a3 and parameterizes
 [`BufferPoolManager`](../src/a5/platform/include/host/profiling_common/buffer_pool_manager.h)
-with `L2PerfModule` (`kBufferKinds = 2`). The only a5-specific
+with `L2SwimlaneModule` (`kBufferKinds = 2`). The only a5-specific
 glue is the 5-callback `MemoryOps` and the per-tick shm mirror.
 
-a5's per-thread AICPU flush hooks (`l2_perf_aicpu_flush_buffers` /
-`l2_perf_aicpu_flush_phase_buffers`) are the only data path on the
+a5's per-thread AICPU flush hooks (`l2_swimlane_aicpu_flush` /
+`l2_swimlane_aicpu_flush_phase_buffers`) are the only data path on the
 records side — host never reads from `current_buf_ptr` to recover
 records. `reconcile_counters` is purely passive: it logs an error if
 any `current_buf_ptr` is non-zero with a non-empty buffer (a
@@ -551,13 +551,13 @@ PHASE), same shape as a2a3.
 
 | Aspect | a2a3 | a5 |
 | ------ | ---- | -- |
-| Record shape | identical (`L2PerfRecord` / `AicpuPhaseRecord`) | |
+| Record shape | identical (`L2SwimlaneAicpuTaskRecord` / `L2SwimlaneAicpuPhaseRecord`) | |
 | AICore WIP-slot protocol | identical | |
 | AICPU commit on FIN | identical | |
 | Buffer model | rotating pool (free + ready queues) per kind | identical |
 | Ready queue | per-AICPU-thread, multiplexes PERF + PHASE via `is_phase` | identical |
 | Host threads | mgmt + poll, streams during execution | identical |
-| Host-class shape | `ProfilerBase<L2PerfCollector, L2PerfModule>` (`kBufferKinds = 2`) | identical |
+| Host-class shape | `ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule>` (`kBufferKinds = 2`) | identical |
 | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` |
 | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) |
 | `reconcile_counters` | passive cross-check (collected + dropped + mismatch == device_total) | identical |
@@ -577,7 +577,7 @@ When enabled, the dominant per-task overhead is:
   ring buffer plus a few metadata fields.
 
-Per scheduler-loop iteration, AICPU also writes a 32-byte
-`AicpuPhaseRecord` per phase (4 phases × 32 B = 128 B per
+Per scheduler-loop iteration, AICPU also writes a 40-byte
+`L2SwimlaneAicpuPhaseRecord` per phase (4 phases × 40 B = 160 B per
 iteration). Both architectures drain buffers concurrently with
 execution via the mgmt + poll thread pair; a5 additionally pays
 per-tick `rtMemcpy`/`memcpy` round-trips to keep the host shadow in
@@ -607,7 +607,7 @@ benchmark is not perturbed.
 
 ### 7.2 a5
 
-- Each per-core `L2PerfBuffer` and per-thread `PhaseBuffer` is
+- Each per-core `L2SwimlaneAicpuTaskBuffer` and per-thread `L2SwimlaneAicpuPhaseBuffer` is
   fixed-size. Tasks past `PLATFORM_PROF_BUFFER_SIZE` per core (and
   phases past `PLATFORM_PHASE_RECORDS_PER_THREAD` per thread) are
   silently dropped via AICPU early return; the host surfaces the
@@ -627,7 +627,7 @@ benchmark is not perturbed.
 
 ## 8. FAQ / Debug Guide
 
-**No `l2_perf_records.json` produced.** Check that
+**No `l2_swimlane_records.json` produced.** Check that
 `--enable-l2-swimlane` was passed. Verify `<output_prefix>` exists
 in the run log; if `--rounds > 1`, only the first round records.
 
@@ -636,7 +636,7 @@ automatically after a SceneTest with `--enable-l2-swimlane`; if it
 did not, run it manually:
 
 ```bash
-python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json
+python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json
 ```
 
 **Tasks show as `func_<id>` instead of human names.** The
@@ -652,13 +652,13 @@ because the buffer pool ran out. On a2a3 check
 
 **`current_buf_ptr` non-empty at finalize on a2a3.** The host logs
 this as ERROR and does not recover. AICPU did not flush its
-active L2 perf buffer at run end. Check the AICPU flush path runs
+active L2 swimlane buffer at run end. Check the AICPU flush path runs
 for every thread that produced records.
 
 **Phase records empty.** Either the runtime did not emit phase
 data (only `tensormap_and_ringbuffer` does, and only when
-`AicpuPhaseHeader::magic == AICPU_PHASE_MAGIC`), or the host's
-`AicpuPhaseHeader` was not initialized. Verify the runtime sets
+`L2SwimlaneAicpuPhaseHeader::magic == L2_SWIMLANE_AICPU_PHASE_MAGIC`), or the host's
+`L2SwimlaneAicpuPhaseHeader` was not initialized. Verify the runtime sets
 the magic in its scheduler init path.
 
 **`dispatch_time_us` < `finish_time_us` mismatch.** Verify the runtime
@@ -670,7 +670,7 @@ wrote the WIP slot but AICPU never committed.
 **Scheduler-overhead deep-dive missing from converter output.**
 The converter runs `sched_overhead_analysis` only when a device
 log is resolvable. Pass `-d <device-id>` or place a `device-*`
-log under `outputs/` close in time to the `l2_perf_records.json`
+log under `outputs/` close in time to the `l2_swimlane_records.json`
 mtime; see `simpler_setup/tools/README.md` for the resolver
 rules.
 
diff --git a/docs/dfx/pmu-profiling.md b/docs/dfx/pmu-profiling.md
index 12f091e1d..8f86ddfb7 100644
--- a/docs/dfx/pmu-profiling.md
+++ b/docs/dfx/pmu-profiling.md
@@ -301,7 +301,7 @@ shared-memory layout, an `init()` that allocates and pre-fills the free
 queues, an `on_buffer_collected()` callback that appends records to the
 CSV, and `reconcile_counters()` / `finalize()`. The mgmt/poll threading,
 buffer pooling, and `Module` trait pattern are shared with TensorDump
-and L2Perf — see [profiling-framework.md](../profiling-framework.md) for
+and L2Swimlane — see [profiling-framework.md](../profiling-framework.md) for
 the framework reference.
 
 ### 5.3 a5 — same framework, host-shadow transport (DAV_3510, 10 counters)
diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md
index 60f3bd8ba..c97b8b3d7 100644
--- a/docs/dfx/tensor-dump.md
+++ b/docs/dfx/tensor-dump.md
@@ -432,7 +432,7 @@ allocates and pre-fills free queues, an `on_buffer_collected`
 callback that gathers payload bytes into the in-memory record
 list, plus `reconcile_counters` / `export_dump_files` /
 `finalize`. The mgmt/poll threading, buffer pooling, and `Module`
-trait pattern are shared with PMU and L2Perf — see
+trait pattern are shared with PMU and L2Swimlane — see
 [profiling-framework.md](../profiling-framework.md) for the
 framework reference.
 
diff --git a/docs/hardware/cache-coherency.md b/docs/hardware/cache-coherency.md
index 80a451351..8605519e9 100644
--- a/docs/hardware/cache-coherency.md
+++ b/docs/hardware/cache-coherency.md
@@ -80,19 +80,19 @@ Two separate concerns, often conflated:
   stale value from a previous round). The AICPU side must emit
   `rmb()` between the COND check and the slot reads.
 
-Concretely, the L2 perf staging-slot read in
-`src/{a2a3,a5}/platform/src/aicpu/l2_perf_collector_aicpu.cpp` does
+Concretely, the L2 swimlane staging-slot read in
+`src/{a2a3,a5}/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp` does
 **not** call `cache_invalidate_range` on the slot, but it **does** call
 `rmb()` before reading `slot->task_id` and the timing fields. All of
 those fields are AICore writes covered by the AICore-side `dcci` in
-`l2_perf_aicore_record_task`. The same pattern applies to the PMU
+`l2_swimlane_aicore_record_task`. The same pattern applies to the PMU
 staging slot
 (`src/{a2a3,a5}/platform/src/aicpu/pmu_collector_aicpu.cpp`).
 
 ### Historical pitfall
 
 PR #540 (2026-04-15) added `cache_invalidate_range(slot, 64)` on the
-AICPU side of the L2 perf staging slot, mirroring the
+AICPU side of the L2 swimlane staging slot, mirroring the
 host-DMA-protocol pattern from PR #204. The two situations are
 **not** the same: host DMA bypasses the AICPU cache; AICore stores
 plus `dcci` do not. The cache invalidate was redundant — but the
@@ -171,11 +171,11 @@ forever once they ship.
 
 - `src/{a2a3,a5}/platform/onboard/aicpu/cache_ops.cpp` — `cache_invalidate_range` implementation (`dc civac` / `dsb sy` / `isb`).
 - `src/{a2a3,a5}/platform/sim/aicpu/cache_ops.cpp` — sim no-op.
-- AICore-side `dcci` usage lives in the L2 perf / PMU AICore collectors and any kernel that publishes to a GM slot AICPU reads.
+- AICore-side `dcci` usage lives in the L2 swimlane / PMU AICore collectors and any kernel that publishes to a GM slot AICPU reads.
 
 ## Related docs
 
 - [PMU staging-slot ordering](../dfx/pmu-profiling.md) —
   detailed AICore-side `dcci` + barrier order for staging-slot writes.
 - [L2 swimlane profiling](../dfx/l2-swimlane-profiling.md) —
-  the consumer of the rules above on the L2 perf path.
+  the consumer of the rules above on the L2 swimlane path.
diff --git a/docs/profiling-framework.md b/docs/profiling-framework.md
index 54ca1a2ec..2f3d61682 100644
--- a/docs/profiling-framework.md
+++ b/docs/profiling-framework.md
@@ -1,6 +1,6 @@
 # Profiling Framework
 
-Shared host-side infrastructure that the PMU, L2Perf, and TensorDump
+Shared host-side infrastructure that the PMU, L2Swimlane, and TensorDump
 collectors are built on. Each architecture maintains its own copy of the
 framework headers under `src/<arch>/platform/include/host/profiling_common/`
 ([a2a3](../src/a2a3/platform/include/host/profiling_common/),
@@ -25,7 +25,7 @@ Each profiling subsystem on a2a3 needs the same plumbing on the host:
 - A collector thread that drains the host-side hand-off queue and copies
   records out of each ready buffer.
 - A pool of pre-registered device buffers (allocated up-front, refilled on
-  demand) keyed by "kind" — PMU has 1 kind, TensorDump has 1, L2Perf has 2
+  demand) keyed by "kind" — PMU has 1 kind, TensorDump has 1, L2Swimlane has 2
   (perf records + phase markers).
 - A dev↔host pointer map so the management thread can resolve a device
   pointer popped off a ready queue to the host-mapped pointer the collector
@@ -40,7 +40,7 @@ a small per-subsystem trait.
 
 ```text
                 ┌──────────────────────────────────────────┐
-                │  PmuCollector / L2PerfCollector /        │  Derived (CRTP)
+                │  PmuCollector / L2SwimlaneCollector /    │  Derived (CRTP)
                 │  TensorDumpCollector                     │  ─ on_buffer_collected
                 └─────────────┬────────────────────────────┘  ─ kIdleTimeoutSec / kSubsystemName
                               │ public ProfilerBase<Derived, Module>
@@ -58,7 +58,7 @@ a small per-subsystem trait.
                               ▲
                               │ Module trait wires layout into algorithms
               ┌───────────────┴────────────────┐
-              │  PmuModule / L2PerfModule /    │  Pure static trait (no state)
+              │  PmuModule / L2SwimlaneModule /│  Pure static trait (no state)
               │  DumpModule                    │  ─ DataHeader / ReadyEntry / FreeQueue
               └────────────────────────────────┘  ─ kBufferKinds / kReadyQueueSize
                                                   ─ resolve_entry / for_each_instance
@@ -129,7 +129,7 @@ is where the unified algorithms live:
 
 ### 3.3 `Module` — trait layer
 
-A stateless `struct` per subsystem (`PmuModule`, `L2PerfModule`,
+A stateless `struct` per subsystem (`PmuModule`, `L2SwimlaneModule`,
 `DumpModule`) that tells the generic algorithms what the shared-memory
 layout looks like. The contract lives in the docblock at the top of
 [`profiler_base.h`](../src/a2a3/platform/include/host/profiling_common/profiler_base.h);
@@ -138,7 +138,7 @@ the required members are:
 | Member | Purpose |
 | ------ | ------- |
 | `using DataHeader / ReadyEntry / ReadyBufferInfo / FreeQueue` | Layout types |
-| `kBufferKinds` (PMU=1, Dump=1, L2Perf=2) | Number of per-kind recycled pools |
+| `kBufferKinds` (PMU=1, Dump=1, L2Swimlane=2) | Number of per-kind recycled pools |
 | `kReadyQueueSize`, `kSlotCount` | AICPU ready queue / free queue depth |
 | `kSubsystemName` | Tag used in framework log lines |
 | `header_from_shm(void*) → DataHeader*` | Cast shared-memory base to header |
@@ -149,7 +149,7 @@ the required members are:
 
 The Module structs are defined alongside their collectors in
 [pmu_collector.h](../src/a2a3/platform/include/host/pmu_collector.h),
-[l2_perf_collector.h](../src/a2a3/platform/include/host/l2_perf_collector.h),
+[l2_swimlane_collector.h](../src/a2a3/platform/include/host/l2_swimlane_collector.h),
 and [tensor_dump_collector.h](../src/a2a3/platform/include/host/tensor_dump_collector.h)
 — each is a few dozen lines of static methods over the subsystem's own
 `DataHeader` / ringbuffer types.
@@ -168,7 +168,7 @@ and only has to provide:
   the collector loop. Use the subsystem's `PLATFORM_*_TIMEOUT_SECONDS`
   constant.
 - `static constexpr const char* kSubsystemName` — appears in the idle
-  timeout log line (e.g. `"PMU"`, `"L2Perf"`, `"TensorDump"`).
+  timeout log line (e.g. `"PMU"`, `"L2Swimlane"`, `"TensorDump"`).
 - `init(...)` and `finalize(...)` — domain-specific setup/teardown.
   `init` must call `set_memory_context()` on the success path so
   `start(tf)` is not a no-op. `finalize` must release framework-owned
@@ -297,7 +297,7 @@ Existing collectors are the canonical examples:
   — single kind, per-core instances. See [pmu-profiling.md](dfx/pmu-profiling.md).
 - [`TensorDumpCollector`](../src/a2a3/platform/include/host/tensor_dump_collector.h)
   — single kind, per-AICPU-thread instances. See [tensor-dump.md](dfx/tensor-dump.md).
-- [`L2PerfCollector`](../src/a2a3/platform/include/host/l2_perf_collector.h)
+- [`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h)
   — two kinds (perf records + phase markers), per-core / per-thread
   instances; the canonical multi-kind example. See
   [l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md).
@@ -332,8 +332,8 @@ changes capture that:
    **not** called from the mgmt loop — it would race with AICPU writes
    to device-only fields (`current_buf_ptr`, `total/dropped/mismatch`
    counters, `queue_tails`, `free_queue.head`,
-   `AicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them back
-   to whatever the host shadow had at the start of the tick. Per-buffer payloads (`L2PerfBuffer` / `PmuBuffer` /
+   `L2SwimlaneAicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them back
+   to whatever the host shadow had at the start of the tick. Per-buffer payloads (`L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` /
    `DumpMetaBuffer`) are still pulled on demand inside
    `ProfilerAlgorithms::process_entry` after resolving the host pointer
    for a popped ready entry. The bulk `mirror_shm_to_device` is kept
@@ -363,7 +363,7 @@ per-core ring/reg addresses travel through `KernelArgs`:
 | `KernelArgs` field | Producer | Consumer |
 | ------------------ | -------- | -------- |
 | `enable_profiling_flag` (bitmask) | host (DeviceRunner) | AICPU `kernel.cpp` → `set_l2_swimlane_enabled` / `set_pmu_enabled` / `set_dump_tensor_enabled`; AICore `KERNEL_ENTRY` → `set_aicore_profiling_flag` |
-| `aicore_l2_perf_ring_addrs` (table) | host (`L2PerfCollector::initialize`) | AICore `KERNEL_ENTRY` indexes `table[block_idx]` → `set_aicore_l2_perf_ring` |
+| `aicore_l2_swimlane_ring_addrs` (table) | host (`L2SwimlaneCollector::initialize`) | AICore `KERNEL_ENTRY` indexes `table[block_idx]` → `set_aicore_l2_swimlane_ring` |
 | `aicore_pmu_ring_addrs` (table) | host (`PmuCollector::init`) | AICore `KERNEL_ENTRY` → `set_aicore_pmu_ring` |
 | `regs` (per-physical-core register-base table) | host (already required for AICPU MMIO) | AICore `KERNEL_ENTRY` resolves `regs[get_physical_core_id()]` → `set_aicore_pmu_reg_base`; AICore `aicore_execute` caches the value at Phase-3 |
 
@@ -376,16 +376,16 @@ state surface, never the runtime protocol.
 
 ### 8.2 Stable AICore staging ring (decouples AICore write from AICPU buffer rotation)
 
-L2Perf and PMU on a5 both use the "AICore writes, AICPU commits" model.
+L2Swimlane and PMU on a5 both use the "AICore writes, AICPU commits" model.
 The AICore-side write target is a per-core
-[`L2PerfAicoreRing`](../src/a5/platform/include/common/l2_perf_profiling.h) /
+[`L2SwimlaneAicoreRing`](../src/a5/platform/include/common/l2_swimlane_profiling.h) /
 [`PmuAicoreRing`](../src/a5/platform/include/common/pmu_profiling.h) of
 `PLATFORM_{L2,PMU}_AICORE_RING_SIZE` (= 2, dual-issue) slots, allocated
 once by the host and addressed by
 `BufferState::aicore_ring_ptr` (AICPU-visible) and the per-core
 `aicore_*_ring_addrs[block_idx]` (AICore-visible). The address is
 never reassigned, so AICore's write target is stable across AICPU's
-rotating `L2PerfBuffer` / `PmuBuffer` flips — flipping is now
+rotating `L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` flips — flipping is now
 fully internal to `*_complete_record` and never crosses into Handshake.
 
 Everything else — Module concept contract, alloc policy
diff --git a/docs/profiling-name-map.md b/docs/profiling-name-map.md
index 105ba2fee..1fff59e9c 100644
--- a/docs/profiling-name-map.md
+++ b/docs/profiling-name-map.md
@@ -2,7 +2,7 @@
 
 ## Problem
 
-Profiling data (`l2_perf_records.json`) identifies tasks by numeric IDs
+Profiling data (`l2_swimlane_records.json`) identifies tasks by numeric IDs
 (e.g., `func_id: 0`).  Without a mapping, swimlane visualizations show
 opaque labels like `func_0_a(t0)` instead of human-readable names like
 `QK(t0)`.
@@ -45,7 +45,7 @@ Every level uses the same structure:
 ### L2 (Orchestration + Incores)
 
 `callable_id` = incore `func_id` (the integer assigned in the CALLABLE
-spec).  These are the same IDs that appear in L2 perf data.
+spec).  These are the same IDs that appear in L2 swimlane data.
 
 ```json
 {
@@ -147,10 +147,10 @@ takes precedence over `-k` (kernel_config.py):
 # Automatic (via SceneTest profiling)
 pytest tests/st/... --platform a5onboard --enable-l2-swimlane
 
-# Manual (paths land alongside l2_perf_records.json inside the same
+# Manual (paths land alongside l2_swimlane_records.json inside the same
 # <output_prefix> directory)
 python -m simpler_setup.tools.swimlane_converter \
-    outputs/<case>_<ts>/l2_perf_records.json \
+    outputs/<case>_<ts>/l2_swimlane_records.json \
     --func-names outputs/<case>_<ts>/name_map_TestPA_basic.json
 
 python -m simpler_setup.tools.deps_to_graph \
@@ -169,7 +169,7 @@ cannot collide.
 
 ```text
 outputs/TestPA_basic_20260416_151301/
-  l2_perf_records.json         # perf data (runtime)
+  l2_swimlane_records.json     # perf data (runtime)
   name_map_TestPA_basic.json   # name mapping (SceneTest)
   merged_swimlane.json         # Perfetto trace (converter)
 ```
diff --git a/docs/sim_multi_device_isolation.md b/docs/sim_multi_device_isolation.md
index 09247e0a6..46c5b5d49 100644
--- a/docs/sim_multi_device_isolation.md
+++ b/docs/sim_multi_device_isolation.md
@@ -24,7 +24,7 @@ Communication uses a 4096-byte shared-memory mailbox per chip — the same layou
 
 ## Why Not Fix the Globals
 
-The global state in `host_runtime.so` spans multiple files (`cpu_sim_context.cpp`, `platform_aicpu_affinity.cpp`, `l2_perf_collector_aicpu.cpp`, `device_log.cpp`) and is deeply embedded in the AICPU/AICore thread model. Fixing each one individually is fragile. Process isolation solves all of them at once with zero platform code changes.
+The global state in `host_runtime.so` spans multiple files (`cpu_sim_context.cpp`, `platform_aicpu_affinity.cpp`, `l2_swimlane_collector_aicpu.cpp`, `device_log.cpp`) and is deeply embedded in the AICPU/AICore thread model. Fixing each one individually is fragile. Process isolation solves all of them at once with zero platform code changes.
 
 ## Files
 
diff --git a/docs/testing.md b/docs/testing.md
index c7c9fd735..68f4f9888 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -104,7 +104,7 @@ python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ l
 | `--case SEL` | | (all) | Case selector, repeatable: `Foo`, `ClassA::Foo`, `ClassA::` |
 | `--manual` | | `exclude` | `exclude`/`include`/`only` for manual cases |
 | `--skip-golden` | | false | Skip golden comparison (for benchmarking) |
-| `--enable-l2-swimlane [PERF_LEVEL]` | | `0` | Enable L2 swimlane collection on first round only. The flag takes an integer perf_level 0–4 (bare = 4); see [docs/dfx/l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md#31-enable-l2-swimlane) for the level table. Each test case gets its own `outputs/<case>_<ts>/` directory under which `l2_perf_records.json` lands; parallel runs never collide. |
+| `--enable-l2-swimlane [PERF_LEVEL]` | | `0` | Enable L2 swimlane collection on first round only. The flag takes an integer perf_level 0–4 (bare = 4); see [docs/dfx/l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md#31-enable-l2-swimlane) for the level table. Each test case gets its own `outputs/<case>_<ts>/` directory under which `l2_swimlane_records.json` lands; parallel runs never collide. |
 | `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution |
 | `--enable-pmu [EVENT_TYPE]` | | `0` | Enable a2a3 PMU CSV collection. Bare flag selects `PIPE_UTILIZATION` (`2`); pass an event type such as `4` for `MEMORY`. |
 | `--exitfirst` | `-x` | false | Stop on first failing test (fail-fast, primarily for CI) |
@@ -318,13 +318,13 @@ A single file can declare both L2 and L3 classes; they're grouped by `(runtime,
 
 Each test case sets its own `CallConfig.output_prefix` (chosen by `scene_test.py::_build_output_prefix` as `outputs/<ClassName>_<case>_<YYYYMMDD_HHMMSS>/`). The C++ runtime writes all diagnostic artifacts under that prefix with fixed filenames:
 
-- `outputs/<case>_<ts>/l2_perf_records.json` — swimlane (`--enable-l2-swimlane`)
+- `outputs/<case>_<ts>/l2_swimlane_records.json` — swimlane (`--enable-l2-swimlane`)
 - `outputs/<case>_<ts>/tensor_dump/` — tensor dump (`--dump-tensor`)
 - `outputs/<case>_<ts>/pmu.csv` — PMU counters (`--enable-pmu`)
 
 Because each case gets its own directory, parallel runs (xdist workers, L3 case fanout, L2 device fanout) can never collide on filename — there is no per-file timestamp, no env-var scoping, and no post-run flatten step. `CallConfig::validate()` throws if any diagnostic flag is enabled but `output_prefix` is empty; `scene_test.py::run_class_cases` always fills it from the case label.
 
-Standalone invocations of CLIs (`python -m simpler_setup.tools.swimlane_converter`, etc.) auto-detect the latest `outputs/*/l2_perf_records.json` (sorted by mtime); pass `--input <path>` to override.
+Standalone invocations of CLIs (`python -m simpler_setup.tools.swimlane_converter`, etc.) auto-detect the latest `outputs/*/l2_swimlane_records.json` (sorted by mtime); pass `--input <path>` to override.
 
 ### Dispatcher skip conditions (normal pytest runs)
 
diff --git a/examples/workers/l2/vector_add/test_run_timing.py b/examples/workers/l2/vector_add/test_run_timing.py
index a3944c087..2624c4173 100644
--- a/examples/workers/l2/vector_add/test_run_timing.py
+++ b/examples/workers/l2/vector_add/test_run_timing.py
@@ -100,7 +100,7 @@ def test_worker_run_returns_run_timing(st_platform, st_device_ids):
     # device_wall must also be > 0 without --enable-l2-swimlane after the
     # Phase B decoupling: orch_summary is written unconditionally when
     # PTO2_PROFILING is on (default build). Hitting 0 here means either:
-    #   - the AICPU's l2_perf_aicpu_write_orch_summary path regressed back
+    #   - the AICPU's l2_swimlane_aicpu_write_orch_summary path regressed back
     #     under an is_l2_swimlane_enabled() gate, or
     #   - the host stopped reading the phase header after the run.
     assert timing.device_wall_us > 0.0, (
diff --git a/scope_stats/scope_stats.jsonl b/scope_stats/scope_stats.jsonl
new file mode 100644
index 000000000..7ce6caf3f
--- /dev/null
+++ b/scope_stats/scope_stats.jsonl
@@ -0,0 +1,3 @@
+{"version": 4, "fatal": false, "dropped": 0, "total": 2, "task_window_max": [16384, 16384, 16384, 16384], "heap_max": [268435456, 268435456, 268435456, 268435456], "tensormap_max": 65536}
+{"site": "(unknown):0", "phase": "begin", "depth": 0, "ring": 0, "task_window_start": 0, "task_window_end": 0, "heap_start": 0, "heap_end": 0, "tensormap": 0}
+{"site": "(unknown):0", "phase": "end", "depth": 0, "ring": 0, "task_window_start": 0, "task_window_end": 1, "heap_start": 0, "heap_end": 0, "tensormap": 1}
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index 29c1c1f16..39f4891a9 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -558,7 +558,7 @@ def _build_output_prefix(case_label: str) -> Path:
     """Per-case directory for diagnostic artifacts.
 
     Each case gets its own ``outputs/<case_label>_<timestamp>/`` directory; the
-    runtime writes ``l2_perf_records.json``, ``tensor_dump/``, and ``pmu.csv``
+    runtime writes ``l2_swimlane_records.json``, ``tensor_dump/``, and ``pmu.csv``
     under that root with fixed filenames. Two cases of the same name run in
     the same second is not a contemplated scenario (parallel xdist runs differ
     by class+method).
@@ -584,7 +584,7 @@ def _run_swimlane_converter(
 
     When ``input_path`` is given, the converter derives its output filename from
     the input's timestamp (see ``swimlane_converter._resolve_output_path``).
-    Without it, the converter auto-selects the latest ``l2_perf_records_*.json``.
+    Without it, the converter auto-selects the latest ``outputs/*/l2_swimlane_records.json``.
     """
     import logging  # noqa: PLC0415
     import subprocess  # noqa: PLC0415
@@ -618,13 +618,13 @@ def _convert_case_swimlane(
     callable_spec: dict | None = None,
 ) -> None:
     """Post-case: invoke the swimlane converter on the perf file the runtime
-    just wrote into ``<output_prefix>/l2_perf_records.json``. No diff/rename
+    just wrote into ``<output_prefix>/l2_swimlane_records.json``. No diff/rename
     dance — the path is known a priori from CallConfig.output_prefix.
     """
     import logging  # noqa: PLC0415
 
     logger = logging.getLogger(__name__)
-    perf_file = output_prefix / "l2_perf_records.json"
+    perf_file = output_prefix / "l2_swimlane_records.json"
     if not perf_file.exists():
         logger.warning(f"[{case_label}] {perf_file} not produced; skipping conversion")
         return
@@ -693,7 +693,7 @@ def run_class_cases(  # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI
         # Per-case directory the runtime writes into. Required (non-empty) when
         # any diagnostic flag is on; CallConfig::validate() throws otherwise.
         # scope_stats now writes <prefix>/scope_stats/scope_stats.jsonl (sibling of
-        # l2_perf_records.json / deps.json), so it pulls output_prefix the
+        # l2_swimlane_records.json / deps.json), so it pulls output_prefix the
         # same way the other DFX flags do.
         prefix = _build_output_prefix(case_label) if diagnostics_on else Path("")
         try:
diff --git a/simpler_setup/tools/README.md b/simpler_setup/tools/README.md
index a1a548440..d2e42530d 100644
--- a/simpler_setup/tools/README.md
+++ b/simpler_setup/tools/README.md
@@ -14,7 +14,7 @@ no repo checkout required.
 - **[deps_to_graph](#deps_to_graph)** — `deps.json` (dep_gen) → pan/zoom HTML dependency graph
 - **[dump_viewer](#dump_viewer)** — inspect / export tensor dumps (see [docs/tensor-dump.md](../../docs/dfx/tensor-dump.md) for full workflow)
 
-Auto-detection paths (`outputs/*/l2_perf_records.json`, `outputs/*/tensor_dump/`)
+Auto-detection paths (`outputs/*/l2_swimlane_records.json`, `outputs/*/tensor_dump/`)
 are resolved relative to the **current working directory** — run these from the
 directory that holds your `outputs/`. Each test case writes into its own
 `outputs/<case>_<ts>/` directory; the tools auto-pick the latest by mtime.
@@ -27,7 +27,7 @@ Convert performance profiling JSON files into Chrome Trace Event format for visu
 
 ### Overview
 
-Converts PTO Runtime profiling data (`l2_perf_records_*.json`) into the format used by the Perfetto trace viewer (<https://ui.perfetto.dev/>). It also produces a task execution statistics summary grouped by function and a scheduler overhead deep-dive report (the same one `sched_overhead_analysis` emits).
+Converts PTO Runtime profiling data (`l2_swimlane_records.json`) into the format used by the Perfetto trace viewer (<https://ui.perfetto.dev/>). It also produces a task execution statistics summary grouped by function and a scheduler overhead deep-dive report (the same one `sched_overhead_analysis` emits).
 
 ### Basic Usage
 
@@ -36,20 +36,20 @@ Converts PTO Runtime profiling data (`l2_perf_records_*.json`) into the format u
 python -m simpler_setup.tools.swimlane_converter
 
 # Specify an input file
-python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json
+python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json
 
 # Specify an output file
-python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json -o custom_output.json
+python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json -o custom_output.json
 
 # Load function name mapping from kernel_config.py
-python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json \
+python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json \
     -k examples/host_build_graph/paged_attention/kernels/kernel_config.py
 
 # Verbose mode (for debugging)
-python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json -v
+python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json -v
 
 # Reuse a deps.json captured in an earlier dep_gen run (different output dir)
-python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json \
+python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json \
     --deps-json outputs/<case>_<earlier_ts>/deps.json
 ```
 
@@ -65,7 +65,7 @@ python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_rec
 
 | Option | Short | Description |
 | ------ | ----- | ----------- |
-| `input` | | Input JSON file (l2_perf_records_*.json). If omitted, the latest file in outputs/ is used |
+| `input` | | Input JSON file (`l2_swimlane_records.json`). If omitted, the latest `outputs/*/l2_swimlane_records.json` is used |
 | `--output` | `-o` | Output JSON file (default: outputs/merged_swimlane_`<timestamp>`.json) |
 | `--kernel-config` | `-k` | Path to kernel_config.py, used for function name mapping |
 | `--func-names` | | Path to func_id_names_*.json (SceneTest format) for function name mapping |
@@ -118,7 +118,7 @@ python examples/scripts/run_example.py \
 
 After the test passes, the tool will:
 
-1. Auto-detect the latest `l2_perf_records_*.json` in outputs/
+1. Auto-detect the latest `outputs/*/l2_swimlane_records.json`
 2. Load function names from the kernel_config.py specified via `-k`
 3. Produce `merged_swimlane_*.json` for visualization
 4. Print the task statistics and scheduler overhead deep-dive report to the console
@@ -133,7 +133,7 @@ Analyze AICPU scheduler overhead and quantitatively decompose the sources of Tai
 
 `sched_overhead_analysis` reads two artifacts produced by the runtime:
 
-1. **Perf profiling data** (`l2_perf_records_*.json`, l2_perf_level >= 3): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas.
+1. **Perf profiling data** (`l2_swimlane_records_*.json`, l2_swimlane_level >= 3): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas.
 2. **`deps.json`** (optional, dep_gen replay output): structural task DAG. When colocated with the perf JSON, Part 2 prints per-thread fanout / fanin aggregates derived from it.
 
 ### Basic Usage
@@ -144,11 +144,11 @@ python -m simpler_setup.tools.sched_overhead_analysis
 
 # Specify the perf JSON explicitly
 python -m simpler_setup.tools.sched_overhead_analysis \
-    --l2-perf-records-json outputs/<case>_<ts>/l2_perf_records.json
+    --l2-swimlane-records-json outputs/<case>_<ts>/l2_swimlane_records.json
 
 # Override the deps.json location
 python -m simpler_setup.tools.sched_overhead_analysis \
-    --l2-perf-records-json outputs/<case>_<ts>/l2_perf_records.json \
+    --l2-swimlane-records-json outputs/<case>_<ts>/l2_swimlane_records.json \
     --deps-json outputs/<case>_<ts>/deps.json
 ```
 
@@ -156,7 +156,7 @@ python -m simpler_setup.tools.sched_overhead_analysis \
 
 | Option | Description |
 | ------ | ----------- |
-| `--l2-perf-records-json` | Path to the l2_perf_records_*.json file. If omitted, the latest file in outputs/ is auto-selected |
+| `--l2-swimlane-records-json` | Path to the `l2_swimlane_records.json` file. If omitted, the latest `outputs/*/l2_swimlane_records.json` (by modification time) is auto-selected |
 | `--deps-json` | Path to deps.json (dep_gen replay output) for fanout / fanin aggregates. Defaults to the deps.json sibling of the perf JSON. |
 
 ### Outputs
@@ -167,7 +167,7 @@ Output is emitted in three parts:
 - **Part 2: AICPU scheduler loop breakdown** — per-scheduler-thread loop statistics, per-phase (scan / complete / dispatch / idle) time ratios, pop_hit / pop_miss totals, and (when deps.json is available) per-thread fanout / fanin aggregates
 - **Part 3: Tail OH distribution & cause analysis** — Tail OH quantile distribution (P10–P99), correlation between scheduler loop iteration time and Tail OH, and data-driven insights into the dominant phase
 
-The perf JSON must be captured at l2_perf_level >= 3 so that `aicpu_scheduler_phases` is non-empty (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing).
+The perf JSON must be captured at l2_swimlane_level >= 3 so that `aicpu_scheduler_phases` is non-empty (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing).
 
 ---
 
@@ -279,11 +279,11 @@ python -m simpler_setup.tools.dump_viewer outputs/<case>_<ts>/tensor_dump/ --ind
 
 ### Input File Format
 
-The analysis tools share the same input format - the `l2_perf_records_*.json` files generated by the PTO Runtime:
+The analysis tools share the same input format - the `l2_swimlane_records_*.json` files generated by the PTO Runtime:
 
 ```json
 {
-  "l2_perf_level": 4,
+  "l2_swimlane_level": 4,
   "tasks": [
     {
       "task_id": 0,
@@ -320,9 +320,9 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi
 Dependency edges come from `deps.json` (dep_gen replay) at post-process time —
 not from the perf JSON. See [`swimlane_converter --deps-json`](#swimlane_converter).
 
-Top-level layout depends on `l2_perf_level`:
+Top-level layout depends on `l2_swimlane_level`:
 
-- All levels: `l2_perf_level`, `tasks[]` (per-task fields above).
+- All levels: `l2_swimlane_level`, `tasks[]` (per-task fields above).
 - `>= 3`: also `aicpu_scheduler_phases[]` (per-thread phase records:
   scan / complete / dispatch / idle) and `core_to_thread[]` (core_id →
   scheduler thread index).
@@ -398,7 +398,7 @@ For batch-run hardware regression, see the dev-only script
 
 ## Troubleshooting
 
-### Error: cannot find l2_perf_records_*.json file
+### Error: cannot find l2_swimlane_records_*.json file
 
 - Make sure the test was run with the `--enable-l2-swimlane` flag
 - Check that the outputs/ directory exists and contains profiling data
@@ -408,18 +408,18 @@ For batch-run hardware regression, see the dev-only script
 - Check the kernel_config.py file format
 - Make sure every KERNELS entry has a 'func_id' and 'name' field
 
-### Error: Unsupported l2_perf_level
+### Error: Unsupported l2_swimlane_level
 
-- The tools accept l2_perf_level 1–4 (the integer captured at runtime
+- The tools accept l2_swimlane_level 1–4 (the integer captured at runtime
   via `--enable-l2-swimlane <N>`)
 - Regenerate the profiling data with a supported level
 
 ### Error: Perf JSON missing required fields for scheduler overhead analysis
 
-- This error means the input `l2_perf_records_*.json` lacks fields required by the deep-dive analysis (typically `dispatch_time_us` / `finish_time_us`)
+- This error means the input `l2_swimlane_records_*.json` lacks fields required by the deep-dive analysis (typically `dispatch_time_us` / `finish_time_us`)
 - The basic conversion in `swimlane_converter` can still succeed, but the deep-dive will be skipped or fail
 - Remediation:
-  1. Re-run with `--enable-l2-swimlane` to produce a new `outputs/*/l2_perf_records.json`
+  1. Re-run with `--enable-l2-swimlane` to produce a new `outputs/*/l2_swimlane_records.json`
   2. Re-run `swimlane_converter` or `sched_overhead_analysis`
   3. Verify that each task in the JSON contains `dispatch_time_us` and `finish_time_us`
 
@@ -435,7 +435,7 @@ For batch-run hardware regression, see the dev-only script
 
 | File | Tool | Purpose | Format |
 | ---- | ---- | ------- | ------ |
-| `l2_perf_records_*.json` | Runtime | Raw timing profiling data | JSON |
+| `l2_swimlane_records_*.json` | Runtime | Raw timing profiling data | JSON |
 | `merged_swimlane_*.json` | swimlane_converter | Perfetto visualization | Chrome Trace Event JSON |
 | `deps.json` | Runtime (dep_gen replay) | Structural task dependency graph + per-edge tensor info | JSON |
 | `deps_graph.html` | deps_to_graph | Pan/zoom dependency graph viewer | HTML (self-contained) |
diff --git a/simpler_setup/tools/deps_to_graph.py b/simpler_setup/tools/deps_to_graph.py
index ec185e6ce..cba7356c6 100644
--- a/simpler_setup/tools/deps_to_graph.py
+++ b/simpler_setup/tools/deps_to_graph.py
@@ -21,7 +21,7 @@
 gotcha is that high zoom slightly blurs text — that's a CSS-transform tradeoff
 in exchange for 60fps GPU-composited pan/zoom even on huge graphs.
 
-When ``l2_perf_records.json`` is colocated with ``deps.json``, node labels are
+When ``l2_swimlane_records.json`` is colocated with ``deps.json``, node labels are
 enriched with the per-task ``func_id`` and ``core_type`` so a node reads as
 ``t12 · kernel_mul · aiv`` rather than just ``t12``; nodes are colored by
 core_type (AIC blue, AIV orange).
@@ -44,7 +44,7 @@
 
 
 def _normalize_task_id(v):
-    """Unsigned 64-bit task id (matches deps.json edges and l2_perf task_id).
+    """Unsigned 64-bit task id (matches deps.json edges and l2_swimlane task_id).
 
     Accepts ints (legacy) and strings (current schema): deps.json emits all
     uint64 fields as quoted strings to dodge JSON-number precision loss in
@@ -239,7 +239,7 @@ def _backfill_output_tensor_ids(task_table, annotations):
 
 
 def _load_task_meta(deps_path, func_names=None):
-    """Optional l2_perf_records.json sidecar → {task_id: {'func_id', 'core_type', ...}}.
+    """Optional l2_swimlane_records.json sidecar → {task_id: {'func_id', 'core_type', ...}}.
 
     Mixed-kernel tasks (single submit_task that spans both AIC and AIV blocks)
     appear as multiple perf-record entries with the same ``task_id`` but
@@ -252,7 +252,7 @@ def _load_task_meta(deps_path, func_names=None):
     Returns {} if no sidecar present. ``func_names`` (optional dict) overrides
     the default ``f{func_id}`` label with a human name.
     """
-    perf_path = Path(deps_path).parent / "l2_perf_records.json"
+    perf_path = Path(deps_path).parent / "l2_swimlane_records.json"
     if not perf_path.exists():
         return {}
     try:
@@ -318,7 +318,7 @@ def _label(task_id, meta, fmt_task, have_perf=False):
 # is an ellipse; "mix" (single submit_task spanning both core types) is a
 # diamond; "alloc" — a task that came from ``alloc_tensors`` (got a real
 # task_id and shows up as a producer in deps via ``owner_task_id``, but
-# never dispatched a kernel so no l2_perf record and no func_id) — is a
+# never dispatched a kernel so no l2_swimlane record and no func_id) — is a
 # dashed gray note. Distinct shape AND color so each stays readable even
 # without color (B&W print, accessibility, etc.).
 _CORE_STYLE = {
diff --git a/simpler_setup/tools/sched_overhead_analysis.py b/simpler_setup/tools/sched_overhead_analysis.py
index 53ad97fd8..6f7579ac8 100644
--- a/simpler_setup/tools/sched_overhead_analysis.py
+++ b/simpler_setup/tools/sched_overhead_analysis.py
@@ -10,7 +10,7 @@
 """Scheduler overhead analysis for PTO2.
 
 Inputs:
-  1. Per-task perf profiling data (l2_perf_records_*.json) with
+  1. Per-task perf profiling data (l2_swimlane_records_*.json) with
      ``aicpu_scheduler_phases`` populated by ``--enable-l2-swimlane`` at
      level >= 3.
   2. deps.json (optional, dep_gen replay output) colocated with the perf JSON,
@@ -18,8 +18,8 @@
 
 Usage:
     python -m simpler_setup.tools.sched_overhead_analysis                   # auto-select latest files
-    python -m simpler_setup.tools.sched_overhead_analysis --l2-perf-records-json <path>
-    python -m simpler_setup.tools.sched_overhead_analysis --l2-perf-records-json <path> --deps-json <path>
+    python -m simpler_setup.tools.sched_overhead_analysis --l2-swimlane-records-json <path>
+    python -m simpler_setup.tools.sched_overhead_analysis --l2-swimlane-records-json <path> --deps-json <path>
 """
 
 import argparse
@@ -48,7 +48,7 @@ def compute_dag_stats_from_deps(deps_data, perf_data, threads):
 
     Why this lives in Python and not the runtime: the DAG edge set is already
     captured structurally by dep_gen (deps.json), and the per-task → scheduler-
-    thread map is in ``l2_perf_records.json::core_to_thread``. Re-instrumenting
+    thread map is in ``l2_swimlane_records.json::core_to_thread``. Re-instrumenting
     the AICPU to track fanout edge counts is duplicate work; running this in
     Python over the existing artifacts is cheaper, more accurate (deps.json
     captures #599 race-window edges that fanout[] dropped), and lets the
@@ -108,7 +108,7 @@ def task_thread(task):
     per_thread_fanin = defaultdict(lambda: {"edges": 0, "max": 0, "tasks": 0})
 
     # Dedup by task_id: mixed (AIC+AIV) tasks emit one perf row per subtask /
-    # core (see l2_perf_collector.cpp:567 — collected_perf_records_ is keyed by
+    # core (see l2_swimlane_collector.cpp:567 — collected_perf_records_ is keyed by
     # core_idx). Without dedup a mixed task's fanout would be charged once per
     # subtask, inflating per-thread edge counts by the subtask count.
     seen_task_ids = set()
@@ -145,20 +145,20 @@ def task_thread(task):
         t["fanin_max_degree"] = fi["max"]
 
 
-def auto_select_l2_perf_records_json():
-    """Find the latest outputs/<case>/l2_perf_records.json (sorted by mtime)."""
+def auto_select_l2_swimlane_records_json():
+    """Find the latest outputs/<case>/l2_swimlane_records.json (sorted by mtime)."""
     outputs_dir = Path.cwd() / "outputs"
-    files = sorted(outputs_dir.glob("*/l2_perf_records.json"), key=lambda p: p.stat().st_mtime, reverse=True)
+    files = sorted(outputs_dir.glob("*/l2_swimlane_records.json"), key=lambda p: p.stat().st_mtime, reverse=True)
     if not files:
-        raise FileNotFoundError(f"No outputs/*/l2_perf_records.json found under {outputs_dir}")
+        raise FileNotFoundError(f"No outputs/*/l2_swimlane_records.json found under {outputs_dir}")
     return files[0]
 
 
 def parse_scheduler_from_json_phases(data):
-    """Extract scheduler Phase breakdown from l2_perf_records JSON.
+    """Extract scheduler Phase breakdown from l2_swimlane_records JSON.
 
     Computes per-thread loop counts, task counts, and phase totals
-    from aicpu_scheduler_phases records (present at l2_perf_level >= 3).
+    from aicpu_scheduler_phases records (present at l2_swimlane_level >= 3).
 
     Returns:
         dict: Thread data keyed by thread index, with per-phase us / pct,
@@ -279,12 +279,12 @@ def validate_perf_tasks_for_overhead_analysis(tasks):
                 f"Missing required fields (showing up to 5 tasks): {detail}",
                 "",
                 "Why this happens:",
-                "  - The input is not a runtime-generated l2_perf_records_*.json, OR",
+                "  - The input is not a runtime-generated l2_swimlane_records_*.json, OR",
                 "  - The runtime binary does not include / emit dispatch+finish timestamps.",
                 "",
                 "How to fix:",
                 "  1) Re-run workload with profiling enabled (e.g. run_example.py --enable-l2-swimlane).",
-                "  2) Pass the newly generated outputs/<case>/l2_perf_records.json via --l2-perf-records-json.",
+                "  2) Pass the newly generated outputs/<case>/l2_swimlane_records.json via --l2-swimlane-records-json.",
                 "  3) Verify each task includes dispatch_time_us and finish_time_us.",
                 "",
                 "Note:",
@@ -297,7 +297,7 @@ def validate_perf_tasks_for_overhead_analysis(tasks):
 
 
 def run_analysis(  # noqa: PLR0912, PLR0915
-    l2_perf_records_path,
+    l2_swimlane_records_path,
     print_sources=True,
     deps_json_path=None,
     perf_data=None,
@@ -305,7 +305,7 @@ def run_analysis(  # noqa: PLR0912, PLR0915
     """Run scheduler overhead analysis report.
 
     Args:
-        l2_perf_records_path: Path to l2_perf_records_*.json.
+        l2_swimlane_records_path: Path to l2_swimlane_records_*.json.
         print_sources: Whether to print selected input files.
         perf_data: Optional pre-parsed perf JSON dict. When provided, skip
             re-reading from disk — main() already parses the file to probe
@@ -318,20 +318,20 @@ def run_analysis(  # noqa: PLR0912, PLR0915
     Returns:
         int: 0 on success, non-zero on failure.
     """
-    l2_perf_records_path = Path(l2_perf_records_path)
+    l2_swimlane_records_path = Path(l2_swimlane_records_path)
 
-    if not l2_perf_records_path.exists():
-        print(f"Error: Perf JSON not found: {l2_perf_records_path}", file=sys.stderr)
+    if not l2_swimlane_records_path.exists():
+        print(f"Error: Perf JSON not found: {l2_swimlane_records_path}", file=sys.stderr)
         return 1
 
     # Auto-discover deps.json sibling when caller didn't specify one.
     if deps_json_path is None:
-        sibling = l2_perf_records_path.parent / "deps.json"
+        sibling = l2_swimlane_records_path.parent / "deps.json"
         if sibling.exists():
             deps_json_path = sibling
 
     if print_sources:
-        print(f"Perf data:  {l2_perf_records_path}")
+        print(f"Perf data:  {l2_swimlane_records_path}")
         if deps_json_path is not None:
             print(f"Deps JSON:  {deps_json_path}")
 
@@ -339,7 +339,7 @@ def run_analysis(  # noqa: PLR0912, PLR0915
     if perf_data is not None:
         data = perf_data
     else:
-        with open(l2_perf_records_path) as f:
+        with open(l2_swimlane_records_path) as f:
             data = json.load(f)
     tasks = data["tasks"]
     n_total = len(tasks)
@@ -573,13 +573,14 @@ def main():
         epilog="""
 Examples:
   %(prog)s                                          # auto-select latest files
-  %(prog)s --l2-perf-records-json outputs/<case>_<ts>/l2_perf_records.json
-  %(prog)s --l2-perf-records-json outputs/<case>_<ts>/l2_perf_records.json --deps-json outputs/<case>_<ts>/deps.json
+  %(prog)s --l2-swimlane-records-json outputs/<case>_<ts>/l2_swimlane_records.json
+  %(prog)s --l2-swimlane-records-json outputs/<case>_<ts>/l2_swimlane_records.json \
+      --deps-json outputs/<case>_<ts>/deps.json
         """,
     )
     parser.add_argument(
-        "--l2-perf-records-json",
-        help="Path to l2_perf_records_*.json file. If not specified, uses the latest in outputs/",
+        "--l2-swimlane-records-json",
+        help="Path to l2_swimlane_records_*.json file. If not specified, uses the latest in outputs/",
     )
     parser.add_argument(
         "--deps-json",
@@ -593,30 +594,32 @@ def main():
 
     # Resolve perf path
     try:
-        l2_perf_records_path = (
-            Path(args.l2_perf_records_json) if args.l2_perf_records_json else auto_select_l2_perf_records_json()
+        l2_swimlane_records_path = (
+            Path(args.l2_swimlane_records_json)
+            if args.l2_swimlane_records_json
+            else auto_select_l2_swimlane_records_json()
         )
     except FileNotFoundError as e:
         print(f"Error: {e}", file=sys.stderr)
         return 1
 
-    if not l2_perf_records_path.exists():
-        print(f"Error: Perf JSON not found: {l2_perf_records_path}", file=sys.stderr)
+    if not l2_swimlane_records_path.exists():
+        print(f"Error: Perf JSON not found: {l2_swimlane_records_path}", file=sys.stderr)
         return 1
 
     # Single load — pass the parsed dict to run_analysis() so it doesn't
     # reread the file (large artifacts hit JSON parsing twice otherwise).
     try:
-        with open(l2_perf_records_path) as _f:
+        with open(l2_swimlane_records_path) as _f:
             perf_data = json.load(_f)
     except (OSError, ValueError) as e:
-        print(f"Error: failed to read perf JSON {l2_perf_records_path}: {e}", file=sys.stderr)
+        print(f"Error: failed to read perf JSON {l2_swimlane_records_path}: {e}", file=sys.stderr)
         return 1
 
     deps_json_path = Path(args.deps_json) if args.deps_json else None
 
     return run_analysis(
-        l2_perf_records_path,
+        l2_swimlane_records_path,
         print_sources=True,
         deps_json_path=deps_json_path,
         perf_data=perf_data,
diff --git a/simpler_setup/tools/swimlane_converter.py b/simpler_setup/tools/swimlane_converter.py
index 1d7fe2103..e2fd941c5 100644
--- a/simpler_setup/tools/swimlane_converter.py
+++ b/simpler_setup/tools/swimlane_converter.py
@@ -14,11 +14,11 @@
 for visualization in Perfetto (https://ui.perfetto.dev/).
 
 Usage:
-    python -m simpler_setup.tools.swimlane_converter  # latest l2_perf_records_*.json under ./outputs/
-    python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json
-    python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json -o out.json
-    python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json -k kernel_config.py
-    python -m simpler_setup.tools.swimlane_converter l2_perf_records_20260210_143526.json -v
+    python -m simpler_setup.tools.swimlane_converter  # latest ./outputs/*/l2_swimlane_records.json
+    python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json
+    python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json -o out.json
+    python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json -k kernel_config.py
+    python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_swimlane_records.json -v
 """
 
 import argparse
@@ -93,7 +93,7 @@ def read_perf_data(filepath):
 
     Returns:
         dict: Parsed performance data with keys:
-            - l2_perf_level
+            - l2_swimlane_level
             - tasks (list)
 
     Raises:
@@ -102,13 +102,13 @@ def read_perf_data(filepath):
     with open(filepath) as f:
         data = json.load(f)
 
-    required_fields = ["l2_perf_level", "tasks"]
+    required_fields = ["l2_swimlane_level", "tasks"]
     for field in required_fields:
         if field not in data:
             raise ValueError(f"Missing required field: {field}")
 
-    if data["l2_perf_level"] not in [1, 2, 3, 4]:
-        raise ValueError(f"Unsupported l2_perf_level: {data['l2_perf_level']} (expected 1, 2, 3, or 4)")
+    if data["l2_swimlane_level"] not in [1, 2, 3, 4]:
+        raise ValueError(f"Unsupported l2_swimlane_level: {data['l2_swimlane_level']} (expected 1, 2, 3, or 4)")
 
     return data
 
@@ -393,15 +393,15 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         output_path: Path to output JSON file
         func_id_to_name: Optional dict mapping func_id to function name
         verbose: Print progress information
-        scheduler_phases: Optional list of per-thread phase record lists (l2_perf_level >= 3)
-        orchestrator_phases: Optional list of per-task orchestrator phase records (l2_perf_level >= 4)
+        scheduler_phases: Optional list of per-thread phase record lists (l2_swimlane_level >= 3)
+        orchestrator_phases: Optional list of per-task orchestrator phase records (l2_swimlane_level >= 4)
         core_to_thread: Optional list mapping core_id (index) to scheduler thread index (-1 = unassigned)
 
     Generates processes in the trace:
         - pid=1 "AICore View": start_time_us to end_time_us (kernel execution)
         - pid=2 "AICPU View": dispatch_time_us to finish_time_us (AICPU perspective)
-        - pid=3 "AICPU Scheduler": scheduler phase bars (l2_perf_level >= 3)
-        - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (l2_perf_level >= 4)
+        - pid=3 "AICPU Scheduler": scheduler phase bars (l2_swimlane_level >= 3)
+        - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (l2_swimlane_level >= 4)
     """
     if verbose:
         print("Generating Chrome Trace JSON...")
@@ -698,7 +698,7 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         if hb_violation_count > 0:
             print(f"  Happens-before violations: {hb_violation_count} edge(s) flagged as 'hb_violation'")
 
-    # AICPU Scheduler phase events (l2_perf_level >= 3)
+    # AICPU Scheduler phase events (l2_swimlane_level >= 3)
     if scheduler_phases:
         # Process metadata
         events.append(
@@ -764,7 +764,7 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
                     }
                 )
 
-    # AICPU Orchestrator lane (l2_perf_level >= 4)
+    # AICPU Orchestrator lane (l2_swimlane_level >= 4)
     #
     # Per-event AicpuPhaseRecord[] is the single source of truth for
     # orchestrator timing. There is no separate aggregate summary — the
@@ -1094,17 +1094,18 @@ def _build_parser():
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-  %(prog)s                                       # Use latest .json in outputs/, output to outputs/
-  %(prog)s l2_perf_records_20260210_143526.json   # Output: outputs/merged_swimlane_20260210_143526.json
-  %(prog)s l2_perf_records_20260210_143526.json -o custom_output.json
-  %(prog)s l2_perf_records_20260210_143526.json -k examples/host_build_graph/paged_attention/kernels/kernel_config.py
-  %(prog)s l2_perf_records_20260210_143526.json -v
+  %(prog)s                                            # Use latest .json in outputs/, output to outputs/
+  %(prog)s outputs/<case>_<ts>/l2_swimlane_records.json   # Output: outputs/<case>_<ts>/merged_swimlane.json
+  %(prog)s outputs/<case>_<ts>/l2_swimlane_records.json -o custom_output.json
+  %(prog)s outputs/<case>_<ts>/l2_swimlane_records.json \
+      -k examples/host_build_graph/paged_attention/kernels/kernel_config.py
+  %(prog)s outputs/<case>_<ts>/l2_swimlane_records.json -v
         """,
     )
     parser.add_argument(
         "input",
         nargs="?",
-        help="Input JSON file (.json). If not specified, uses the latest l2_perf_records_*.json in outputs/",
+        help="Input JSON file (.json). If not specified, uses the latest l2_swimlane_records_*.json in outputs/",
     )
     parser.add_argument("-o", "--output", help="Output JSON file (default: <input_dir>/merged_swimlane.json)")
     parser.add_argument(
@@ -1128,7 +1129,7 @@ def _build_parser():
 
 
 def _resolve_input_path(args):
-    """Resolve input path, auto-selecting newest outputs/<case>/l2_perf_records.json if unspecified."""
+    """Resolve input path, auto-selecting newest outputs/<case>/l2_swimlane_records.json if unspecified."""
     if args.input is not None:
         input_path = Path(args.input)
         if not input_path.exists():
@@ -1137,9 +1138,9 @@ def _resolve_input_path(args):
         return input_path
 
     outputs_dir = Path.cwd() / "outputs"
-    json_files = list(outputs_dir.glob("*/l2_perf_records.json"))
+    json_files = list(outputs_dir.glob("*/l2_swimlane_records.json"))
     if not json_files:
-        print(f"Error: No outputs/*/l2_perf_records.json found under {outputs_dir}", file=sys.stderr)
+        print(f"Error: No outputs/*/l2_swimlane_records.json found under {outputs_dir}", file=sys.stderr)
         print("Run a test with --enable-l2-swimlane first, or specify an explicit input.", file=sys.stderr)
         return None
 
@@ -1161,11 +1162,11 @@ def _resolve_output_path(args, input_path):
 
 def _print_verbose_data_info(data, verbose):
     """Print verbose summary of loaded performance data, including phase counts
-    when present (l2_perf_level >= SCHED_PHASES)."""
+    when present (l2_swimlane_level >= SCHED_PHASES)."""
     if not verbose:
         return
     print("\n=== Performance Data ===")
-    print(f"  L2 perf level: {data['l2_perf_level']}")
+    print(f"  L2 perf level: {data['l2_swimlane_level']}")
     print(f"  Task Count: {len(data['tasks'])}")
     if data["tasks"]:
         start_times = [t["start_time_us"] for t in data["tasks"]]
diff --git a/src/a2a3/platform/include/aicore/aicore_profiling_state.h b/src/a2a3/platform/include/aicore/aicore_profiling_state.h
index b41a60dbb..7d48c91a7 100644
--- a/src/a2a3/platform/include/aicore/aicore_profiling_state.h
+++ b/src/a2a3/platform/include/aicore/aicore_profiling_state.h
@@ -25,14 +25,14 @@
  *
  * Lifecycle:
  *   1. Host fills `KernelArgs::enable_profiling_flag` and
- *      `KernelArgs::aicore_ring_addr` (points to a per-core `AicoreRotation`
+ *      `KernelArgs::l2_swimlane_aicore_rotation_table` (points to a per-core `L2SwimlaneAicoreRotation`
  *      device-address table). Host allocates the table bytes; AICPU populates
- *      the entries inside `l2_perf_aicpu_init`.
- *   2. AICore kernel entry stashes `&aicore_ring_addr[block_idx]` (the slot
+ *      the entries inside `l2_swimlane_aicpu_init`.
+ *   2. AICore kernel entry stashes `&l2_swimlane_aicore_rotation_table[block_idx]` (the slot
  *      pointer — NOT the dereferenced rotation pointer yet) via
- *      `set_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`,
+ *      `set_l2_swimlane_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`,
  *      before invoking `aicore_execute`.
- *   3. `get_aicore_rotation()` lazily dereferences the slot the first time
+ *   3. `get_l2_swimlane_aicore_rotation()` lazily dereferences the slot the first time
  *      it is called. Callers must defer the call until AFTER AICPU has
  *      dispatched the first task (so AICPU init has had a chance to populate
  *      the table). The executor handles this by calling it inside the main
@@ -45,7 +45,7 @@
 #include <cstdint>
 
 #include "aicore/aicore.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 
 /**
  * Profiling enable bitmask (umbrella over dump_tensor / l2_swimlane / pmu).
@@ -58,20 +58,20 @@ __aicore__ uint32_t get_aicore_profiling_flag();
 /**
  * Per-core AICore rotation channel.
  *
- * `set_aicore_rotation_slot(slot)` stashes the address of THIS core's slot
- * in the rotation-address table — `&((uint64_t*)k_args->aicore_ring_addr)[block_idx]`.
+ * `set_l2_swimlane_aicore_rotation_slot(slot)` stashes the address of THIS core's slot
+ * in the rotation-address table — `&((uint64_t*)k_args->l2_swimlane_aicore_rotation_table)[block_idx]`.
  * No dereference happens here, because at kernel entry the AICPU side may
  * not yet have populated the table (the host launches both kernels and
  * AICPU's init runs concurrently with AICore's entry).
  *
- * `get_aicore_rotation()` lazily dereferences the stashed slot on first use,
+ * `get_l2_swimlane_aicore_rotation()` lazily dereferences the stashed slot on first use,
  * caches the result, and returns it on subsequent calls. Callers MUST defer
  * the first call until after AICPU has dispatched the first task — by then
  * AICPU's init has completed and the slot holds a valid device address.
  * The executor's main loop honours this by reading the rotation only inside
  * the first-task branch of the dispatch poll.
  */
-__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr);
-__aicore__ __gm__ AicoreRotation *get_aicore_rotation();
+__aicore__ void set_l2_swimlane_aicore_rotation_slot(__gm__ uint64_t *slot_ptr);
+__aicore__ __gm__ L2SwimlaneAicoreRotation *get_l2_swimlane_aicore_rotation();
 
 #endif  // PLATFORM_AICORE_AICORE_PROFILING_STATE_H_
diff --git a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h b/src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h
similarity index 75%
rename from src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h
rename to src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h
index c5aaadd0b..c6456dde2 100644
--- a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h
+++ b/src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h
@@ -9,17 +9,17 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * @file l2_perf_collector_aicore.h
+ * @file l2_swimlane_collector_aicore.h
  * @brief AICore performance data collection interface
  *
  * Provides lightweight performance recording interface for AICore kernels.
  * Uses dcci for efficient cache management instead of memory barriers.
  */
 
-#ifndef PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_
-#define PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_
+#ifndef PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_
+#define PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_
 
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "aicore/aicore.h"
 
 // Include platform-specific timestamp implementation
@@ -33,13 +33,13 @@
 
 /**
  * AICore-local rotation state. Tracks which buffer this core is currently
- * writing into and which slot is next. Reset by `l2_perf_aicore_record_task`
- * when it observes a generation bump on the shared `AicoreRotation` channel
+ * writing into and which slot is next. Reset by `l2_swimlane_aicore_record_task`
+ * when it observes a generation bump on the shared `L2SwimlaneAicoreRotation` channel
  * (AICPU rotates by writing `current_buf_ptr` + bumping `generation`, so the
  * AICore-local state self-recovers without any AICore-side spin-wait).
  */
-struct AicoreLocalState {
-    __gm__ L2PerfAicoreBuffer *cached_buf = nullptr;
+struct L2SwimlaneAicoreLocalState {
+    __gm__ L2SwimlaneAicoreTaskBuffer *cached_buf = nullptr;
     // Must start != AICPU's initial generation (1) so the first record_task
     // call observes a generation mismatch and loads the buffer pointer.
     uint32_t cached_generation = 0;
@@ -49,10 +49,10 @@ struct AicoreLocalState {
 /**
  * Record task execution performance data.
  *
- * AICore writes a slim L2PerfAicoreRecord into its currently-published
- * per-core L2PerfAicoreBuffer at `records[slot_within_buf++]`. The
- * publication channel is an AicoreRotation cache line addressed via
- * `KernelArgs::aicore_ring_addr[block_idx]` (now points to AicoreRotation,
+ * AICore writes a slim L2SwimlaneAicoreTaskRecord into its currently-published
+ * per-core L2SwimlaneAicoreTaskBuffer at `records[slot_within_buf++]`. The
+ * publication channel is an L2SwimlaneAicoreRotation cache line addressed via
+ * `KernelArgs::l2_swimlane_aicore_rotation_table[block_idx]` (now points to L2SwimlaneAicoreRotation,
  * not directly to a buffer). AICPU updates `rotation->current_buf_ptr` and
  * bumps `rotation->generation` at dispatch boundaries; AICore detects the
  * change by `dcci`-ing the rotation line per task and comparing generation
@@ -69,22 +69,23 @@ struct AicoreLocalState {
  * so AICore has already finished writing their records before AICPU enqueues
  * the old buffer to the ready queue.
  *
- * @param rotation Per-core AicoreRotation channel (cached at kernel entry
- *                 from KernelArgs::aicore_ring_addr[block_idx])
+ * @param rotation Per-core L2SwimlaneAicoreRotation channel (cached at kernel entry
+ *                 from KernelArgs::l2_swimlane_aicore_rotation_table[block_idx])
  * @param local    Per-core AICore-local state (caller-owned static)
  * @param task_id  Register dispatch id (DATA_MAIN_BASE), low 32 bits
  * @param start_time Start timestamp (get_sys_cnt)
  * @param end_time   End timestamp
  */
-__aicore__ __attribute__((always_inline)) static inline void l2_perf_aicore_record_task(
-    __gm__ AicoreRotation *rotation, AicoreLocalState *local, uint32_t task_id, uint64_t start_time, uint64_t end_time
+__aicore__ __attribute__((always_inline)) static inline void l2_swimlane_aicore_record_task(
+    __gm__ L2SwimlaneAicoreRotation *rotation, L2SwimlaneAicoreLocalState *local, uint32_t task_id, uint64_t start_time,
+    uint64_t end_time
 ) {
     // Re-fetch rotation channel each task; cheap relative to the
     // baseline `dcci(payload, ENTIRE_DATA_CACHE)` we already pay per task.
     dcci(rotation, SINGLE_CACHE_LINE);
     if (rotation->generation != local->cached_generation) {
         local->cached_generation = rotation->generation;
-        local->cached_buf = reinterpret_cast<__gm__ L2PerfAicoreBuffer *>(rotation->current_buf_ptr);
+        local->cached_buf = reinterpret_cast<__gm__ L2SwimlaneAicoreTaskBuffer *>(rotation->current_buf_ptr);
         local->slot_within_buf = 0;
     }
     if (local->cached_buf == nullptr) {
@@ -102,7 +103,7 @@ __aicore__ __attribute__((always_inline)) static inline void l2_perf_aicore_reco
         return;
     }
 
-    __gm__ L2PerfAicoreRecord *record = &local->cached_buf->records[slot];
+    __gm__ L2SwimlaneAicoreTaskRecord *record = &local->cached_buf->records[slot];
     record->start_time = start_time;
     record->end_time = end_time;
     record->task_id = task_id;
@@ -113,4 +114,4 @@ __aicore__ __attribute__((always_inline)) static inline void l2_perf_aicore_reco
     dsb((mem_dsb_t)0);
 }
 
-#endif  // PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_
+#endif  // PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_
diff --git a/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h b/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h
index 12e9a74e6..c2a16a859 100644
--- a/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h
+++ b/src/a2a3/platform/include/aicpu/dep_gen_collector_aicpu.h
@@ -53,7 +53,7 @@ extern "C" bool is_dep_gen_enabled();
  * the per-thread ready_queue when buffers fill or on flush. Must be called by
  * aicpu_executor.cpp before any dep_gen_aicpu_record_submit() can fire.
  *
- * Mirrors l2_perf_aicpu_set_orch_thread_idx().
+ * Mirrors l2_swimlane_aicpu_set_orch_thread_idx().
  */
 void dep_gen_aicpu_set_orch_thread_idx(int thread_idx);
 
diff --git a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
similarity index 66%
rename from src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h
rename to src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
index 03465e02b..ecfb9723e 100644
--- a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h
+++ b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -9,17 +9,17 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * @file l2_perf_collector_aicpu.h
+ * @file l2_swimlane_collector_aicpu.h
  * @brief AICPU performance data collection interface
  *
  * Provides performance profiling management interface for AICPU side.
  * Handles buffer initialization, switching, and flushing.
  */
 
-#ifndef PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_
-#define PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_
+#ifndef PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_
+#define PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_
 
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 
 // Include platform-specific timestamp implementation
 // Build system selects the correct inner_aicpu.h based on platform:
@@ -29,51 +29,51 @@
 // ============= Public Interface =============
 
 /**
- * L2 perf handshake setters — called by the host (sim) or the AICPU kernel
- * entry (onboard) before `l2_perf_aicpu_init()` so AICPU code can read perf
+ * L2 swimlane handshake setters — called by the host (sim) or the AICPU kernel
+ * entry (onboard) before `l2_swimlane_aicpu_init()` so AICPU code can read perf
  * state without reaching into the generic `Runtime` struct.
  *
  * Two-channel level transport (mirrors the PMU pattern):
  *   - binary on/off — `enable_profiling_flag` bit1 → `set_l2_swimlane_enabled(bool)`
  *     at kernel entry; queried via `is_l2_swimlane_enabled()`.
- *   - granular L2PerfLevel — `L2PerfDataHeader::l2_perf_level`
- *     (shared memory); read in `l2_perf_aicpu_init` and cached, then queried
- *     via `get_l2_perf_level()` for
+ *   - granular L2SwimlaneLevel — `L2SwimlaneDataHeader::l2_swimlane_level`
+ *     (shared memory); read in `l2_swimlane_aicpu_init` and cached, then queried
+ *     via `get_l2_swimlane_level()` for
  *     `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
  */
-extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base);
-extern "C" uint64_t get_platform_l2_perf_base();
+extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base);
+extern "C" uint64_t get_platform_l2_swimlane_base();
 extern "C" void set_l2_swimlane_enabled(bool enable);
 extern "C" bool is_l2_swimlane_enabled();
 
-// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr).
+// AICore rotation-table device pointer (= KernelArgs::l2_swimlane_aicore_rotation_table).
 // Published by the host before AICPU init runs; AICPU init fills the table
-// with the per-core `&L2PerfAicoreBufferState::rotation` device addresses so
-// AICore can index `aicore_ring_addr[block_idx]` to find its rotation channel.
+// with the per-core `&L2SwimlaneAicoreTaskPool::rotation` device addresses so
+// AICore can index `l2_swimlane_aicore_rotation_table[block_idx]` to find its rotation channel.
 // Moved from host into AICPU so the host stays decoupled from the AICore-side
 // shared-memory layout (host previously did host-to-device address translation
 // + reached into get_aicore_buffer_state to fill this).
-extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr);
-extern "C" uint64_t get_platform_aicore_rotation_table();
+extern "C" void set_platform_l2_swimlane_aicore_rotation_table(uint64_t table_addr);
+extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table();
 
 // Typed getter for the granular perf_level (promoted from the shared-memory
-// header inside l2_perf_aicpu_init). Gate sites should use this so the
-// comparison RHS is a named L2PerfLevel constant.
-L2PerfLevel get_l2_perf_level();
+// header inside l2_swimlane_aicpu_init). Gate sites should use this so the
+// comparison RHS is a named L2SwimlaneLevel constant.
+L2SwimlaneLevel get_l2_swimlane_level();
 
 /**
  * Initialize performance profiling
  *
  * Sets up the AICPU buffer pool for each core and initializes tracking state.
- * Reads the perf device-base pointer published via `set_platform_l2_perf_base()`.
+ * Reads the perf device-base pointer published via `set_platform_l2_swimlane_base()`.
  *
  * Also primes the per-core AICore rotation channel: pops the initial
- * L2PerfAicoreBuffer from L2PerfAicoreBufferState::free_queue and writes its
- * address into the AicoreRotation channel that AICore polls per task.
+ * L2SwimlaneAicoreTaskBuffer from L2SwimlaneAicoreTaskPool::free_queue and writes its
+ * address into the L2SwimlaneAicoreRotation channel that AICore polls per task.
  *
  * @param worker_count  Number of AICore workers (cores) to initialize
  */
-void l2_perf_aicpu_init(int worker_count);
+void l2_swimlane_aicpu_init(int worker_count);
 
 /**
  * Rotate the AICore buffer for a given core, if needed.
@@ -89,22 +89,22 @@ void l2_perf_aicpu_init(int worker_count);
  * (and AICore has finished writing their records into the old buffer) before
  * the old buffer enters the ready queue.
  *
- * Called regardless of l2_perf_level — internally gates on AICORE_TIMING.
+ * Called regardless of l2_swimlane_level — internally gates on AICORE_TIMING.
  *
  * @param core_id     Core index
  * @param thread_idx  Owning AICPU thread (target ready-queue)
  */
-void l2_perf_aicpu_maybe_rotate_aicore(int core_id, int thread_idx);
+void l2_swimlane_aicpu_maybe_rotate_aicore(int core_id, int thread_idx);
 
 /**
- * Complete a L2PerfRecord with AICPU-side metadata after AICore task completion
+ * Complete an L2SwimlaneAicpuTaskRecord with AICPU-side metadata after AICore task completion
  *
  * AICore-as-producer: AICore writes start/end/task_id directly into the
- * per-core L2PerfAicoreBuffer at `records[reg_task_id % SIZE]`. AICPU does
+ * per-core L2SwimlaneAicoreTaskBuffer at `records[slot_within_buf++]`. AICPU does
  * NOT read that buffer on the hot path — it only writes AICPU-owned fields
  * (task_id, reg_task_id, func_id, core_type, dispatch_time, finish_time)
  * here, leaving start/end as zero. The host post-processor joins the AICore
- * stream into the L2PerfRecord stream by `reg_task_id` at flush time.
+ * stream into the L2SwimlaneAicpuTaskRecord stream by `reg_task_id` at flush time.
  *
  * Per-core counter accounting:
  *   total_record_count++       — every commit attempt (success or failure)
@@ -115,14 +115,14 @@ void l2_perf_aicpu_maybe_rotate_aicore(int core_id, int thread_idx);
  * @param core_id               Core index — used to resolve buffer state and update counters
  * @param thread_idx            Owning AICPU thread (used when rotating records buffer)
  * @param expected_reg_task_id  Register dispatch token (low 32 bits) — written
- *                              into L2PerfRecord.reg_task_id as the join key
+ *                              into L2SwimlaneAicpuTaskRecord.reg_task_id as the join key
  * @param task_id               Task identifier to write (PTO2 encoding or plain id)
  * @param func_id               Kernel function identifier
  * @param core_type             Core type (AIC/AIV)
  * @param dispatch_time         AICPU timestamp when task was dispatched
  * @param finish_time           AICPU timestamp when task completion was observed
  */
-int l2_perf_aicpu_complete_record(
+int l2_swimlane_aicpu_complete_task(
     int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type,
     uint64_t dispatch_time, uint64_t finish_time
 );
@@ -136,24 +136,24 @@ int l2_perf_aicpu_complete_record(
  * @param cur_thread_cores Array of core IDs managed by this thread
  * @param core_num Number of cores managed by this thread
  */
-void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num);
+void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num);
 
 /**
  * Initialize AICPU phase profiling
  *
- * Sets up AicpuPhaseHeader and clears per-thread phase record buffers.
- * Must be called once from thread 0 after l2_perf_aicpu_init().
+ * Sets up L2SwimlaneAicpuPhaseHeader and clears per-thread phase record buffers.
+ * Must be called once from thread 0 after l2_swimlane_aicpu_init().
  *
  * @param worker_count       Number of AICore workers (cores) — used to resolve
- *                           the phase region's offset relative to the L2Perf base
+ *                           the phase region's offset relative to the L2Swimlane base
  * @param num_sched_threads  Number of scheduler threads
  */
-void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads);
+void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads);
 
 /**
  * Record a single scheduler phase
  *
- * Appends an AicpuPhaseRecord to the specified thread's buffer.
+ * Appends an L2SwimlaneAicpuPhaseRecord to the specified thread's buffer.
  * Silently drops records when the buffer is full.
  *
  * @param thread_idx Scheduler thread index
@@ -164,12 +164,12 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads);
  * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or
  *                        full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator
  *                        phases in tensormap_and_ringbuffer)
- * @param extra1, extra2  Phase-specific delta counters (see AicpuPhaseRecord doc).
+ * @param extra1, extra2  Phase-specific delta counters (see L2SwimlaneAicpuPhaseRecord doc).
  *                        SCHED_DISPATCH uses extra1=pop_hit, extra2=pop_miss; other
  *                        phases pass 0.
  */
-void l2_perf_aicpu_record_phase(
-    int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
+void l2_swimlane_aicpu_record_phase(
+    int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
     uint64_t tasks_processed, uint32_t extra1 = 0, uint32_t extra2 = 0
 );
 
@@ -177,22 +177,22 @@ void l2_perf_aicpu_record_phase(
  * Set orchestrator thread index for per-task phase recording
  *
  * Must be called once from the orchestrator thread before any
- * l2_perf_aicpu_record_orch_phase() calls.
+ * l2_swimlane_aicpu_record_orch_phase() calls.
  *
  * @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
  */
-void l2_perf_aicpu_set_orch_thread_idx(int thread_idx);
+void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx);
 
 /**
  * Record one orchestrator submit envelope
  *
- * Appends an AicpuPhaseRecord covering an entire submit_task() / alloc_tensors()
+ * Appends an L2SwimlaneAicpuPhaseRecord covering an entire submit_task() / alloc_tensors()
  * call. Uses the orchestrator's dedicated buffer slot (set via
  * set_orch_thread_idx). Per-sub-step phase records (ORCH_SYNC..ORCH_FANIN)
  * were dropped — the per-step cumulatives (`g_orch_*_cycle`) in the
  * cold-path log carry the breakdown that those records were duplicating.
  *
- * @param phase_id Always AicpuPhaseId::ORCH_SUBMIT. (Param kept for API
+ * @param phase_id Always L2SwimlaneAicpuPhaseId::ORCH_SUBMIT. (Param kept for API
  *                 stability; legacy values are ignored by the host parser.)
  * @param start_time Submit start timestamp
  * @param end_time Submit end timestamp
@@ -200,28 +200,28 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx);
  * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding:
  * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes.
  */
-void l2_perf_aicpu_record_orch_phase(
-    AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
+void l2_swimlane_aicpu_record_orch_phase(
+    L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
 );
 
 /**
  * Write core-to-thread assignment mapping to shared memory.
  *
- * Callers invoke `l2_perf_aicpu_init_core_assignments(total_cores)` once, then
- * `l2_perf_aicpu_write_core_assignments_for_thread(t, ids, n)` for every
+ * Callers invoke `l2_swimlane_aicpu_init_core_assignments(total_cores)` once, then
+ * `l2_swimlane_aicpu_write_core_assignments_for_thread(t, ids, n)` for every
  * scheduler thread.
  */
-void l2_perf_aicpu_init_core_assignments(int total_cores);
-void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num);
+void l2_swimlane_aicpu_init_core_assignments(int total_cores);
+void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num);
 
 /**
  * Flush remaining phase records for a thread
  *
  * Marks the current WRITING phase buffer as READY and enqueues it
- * for host collection. Called at thread exit (analogous to l2_perf_aicpu_flush_buffers).
+ * for host collection. Called at thread exit (analogous to l2_swimlane_aicpu_flush).
  *
  * @param thread_idx Thread index (scheduler thread or orchestrator)
  */
-void l2_perf_aicpu_flush_phase_buffers(int thread_idx);
+void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx);
 
-#endif  // PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_
+#endif  // PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_
diff --git a/src/a2a3/platform/include/common/dep_gen.h b/src/a2a3/platform/include/common/dep_gen.h
index 091fd349a..226bf9f2a 100644
--- a/src/a2a3/platform/include/common/dep_gen.h
+++ b/src/a2a3/platform/include/common/dep_gen.h
@@ -19,7 +19,7 @@
  * sole source of truth for fanout edges; the L2 swimlane hot path no longer
  * carries fanout to keep AICPU off the per-task GM-store critical path.
  *
- * Streaming buffer design mirrors PMU / L2Perf / TensorDump (single source of
+ * Streaming buffer design mirrors PMU / L2Swimlane / TensorDump (single source of
  * algorithmic truth in src/a2a3/platform/include/host/profiling_common/profiler_base.h):
  *
  *   DepGenFreeQueue    — SPSC: Host pushes free DepGenBuffers, AICPU pops them.
@@ -29,7 +29,7 @@
  *
  * Single-instance: the orchestrator is one AICPU thread, so the BufferState
  * array has length 1. Kept array-shaped (vs scalar) for symmetry with PMU /
- * L2Perf and to match ProfilerBase<DepGenModule>::for_each_instance.
+ * L2Swimlane and to match ProfilerBase<DepGenModule>::for_each_instance.
  *
  * Tensor data is captured as opaque 128-byte blobs (`DEP_GEN_TENSOR_SIZE`)
  * matching the runtime Tensor struct size. The AICPU writer
diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h
index aa5422484..c9a50f525 100644
--- a/src/a2a3/platform/include/common/kernel_args.h
+++ b/src/a2a3/platform/include/common/kernel_args.h
@@ -83,17 +83,21 @@ struct KernelArgs {
     __may_used_by_aicore__ Runtime *runtime_args{nullptr};  // Task runtime in device memory
     uint64_t regs{0};                                       // Per-core register base address array (platform-specific)
     uint64_t ffts_base_addr{0};                             // FFTS base address for AICore
-    uint64_t dump_data_base{0};         // Dump shared memory base address; use explicit flags to detect enablement
-    uint64_t l2_perf_data_base{0};      // L2 perf shared memory base address; use explicit flags to detect enablement
+    uint64_t dump_data_base{0};  // Dump shared memory base address; use explicit flags to detect enablement
+    uint64_t l2_swimlane_data_base{
+        0
+    };  // L2 swimlane shared memory base address; use explicit flags to detect enablement
     uint64_t pmu_data_base{0};          // PMU shared memory base address; use explicit flags to detect enablement
     uint64_t pmu_reg_addrs{0};          // Per-core PMU MMIO register base address array (onboard only; 0 on sim)
     uint64_t dep_gen_data_base{0};      // dep_gen shared memory base address; use explicit flags to detect enablement
     uint64_t scope_stats_data_base{0};  // ScopeStatsBuffer shared memory base; 0 when scope_stats is off.
                                         // Allocated by host's ScopeStatsCollector, read+written by AICPU's
                                         // scope_stats_collector via set_platform_scope_stats_base.
-    uint64_t aicore_ring_addr{0};       // Device ptr to a uint64_t[num_aicore] table holding each core's
-                                        // L2PerfAicoreBuffer address. AICore kernel entry indexes by block_idx
-                                        // and forwards into platform set/get state. 0 when L2 swimlane is off.
+    uint64_t l2_swimlane_aicore_rotation_table{
+        0
+    };  // Device ptr to a uint64_t[num_aicore] table holding each core's
+        // L2SwimlaneAicoreRotation channel address. AICore kernel entry indexes by block_idx
+        // and forwards into platform set/get state. 0 when L2 swimlane is off.
     uint32_t log_level{1};              // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
     uint32_t log_info_v{5};             // INFO verbosity threshold (0..9); default V5
     uint32_t enable_profiling_flag{0};  // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats
diff --git a/src/a2a3/platform/include/common/l2_perf_profiling.h b/src/a2a3/platform/include/common/l2_swimlane_profiling.h
similarity index 68%
rename from src/a2a3/platform/include/common/l2_perf_profiling.h
rename to src/a2a3/platform/include/common/l2_swimlane_profiling.h
index ead322823..68d98dd75 100644
--- a/src/a2a3/platform/include/common/l2_perf_profiling.h
+++ b/src/a2a3/platform/include/common/l2_swimlane_profiling.h
@@ -10,49 +10,49 @@
  */
 
 /**
- * @file l2_perf_profiling.h
+ * @file l2_swimlane_profiling.h
  * @brief Performance profiling data structures
  *
  * Architecture: Fixed header + per-core/thread buffer states + optional phase profiling region
  *
  * Memory layout (shared memory between Host and Device):
  * ┌─────────────────────────────────────────────────────────────┐
- * │ L2PerfDataHeader (fixed header)                               │
+ * │ L2SwimlaneDataHeader (fixed header)                         │
  * │  - ReadyQueue (FIFO, capacity=PLATFORM_PROF_READYQUEUE_SIZE)│
  * │  - Metadata (num_cores, flags)                              │
  * ├─────────────────────────────────────────────────────────────┤
- * │ L2PerfBufferState[0] (Core 0)                                 │
+ * │ L2SwimlaneAicpuTaskPool[0] (Core 0)                         │
  * │  - free_queue: SPSC queue of available buffer pointers      │
  * │  - current_buf_ptr, current_buf_seq                         │
  * ├─────────────────────────────────────────────────────────────┤
- * │ L2PerfBufferState[1] (Core 1)                                 │
+ * │ L2SwimlaneAicpuTaskPool[1] (Core 1)                         │
  * ├─────────────────────────────────────────────────────────────┤
  * │ ...                                                         │
  * ├─────────────────────────────────────────────────────────────┤
- * │ L2PerfBufferState[num_cores-1]                                │
+ * │ L2SwimlaneAicpuTaskPool[num_cores-1]                        │
  * ├─────────────────────────────────────────────────────────────┤
- * │ AicpuPhaseHeader (optional, present when phase profiling)   │
+ * │ L2SwimlaneAicpuPhaseHeader (optional, with phase profiling) │
  * │  - magic, num_sched_threads, records_per_thread             │
  * │  - core_to_thread mapping                                   │
  * ├─────────────────────────────────────────────────────────────┤
- * │ PhaseBufferState[thread0]                                   │
+ * │ L2SwimlaneAicpuPhasePool[thread0]                           │
  * │  - free_queue: SPSC queue of available buffer pointers      │
  * │  - current_buf_ptr, current_buf_seq                         │
  * ├─────────────────────────────────────────────────────────────┤
- * │ PhaseBufferState[thread1]                                   │
+ * │ L2SwimlaneAicpuPhasePool[thread1]                           │
  * ├─────────────────────────────────────────────────────────────┤
  * │ ...                                                         │
  * └─────────────────────────────────────────────────────────────┘
  *
- * Actual L2PerfBuffer / PhaseBuffer are allocated dynamically by Host
+ * Actual L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer are allocated dynamically by Host
  * and pushed into the per-core/thread free_queue.
  *
- * Base size = sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState)
- * With phases = Base + sizeof(AicpuPhaseHeader) + num_threads * sizeof(PhaseBufferState)
+ * Base size = sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool)
+ * With phases = Base + sizeof(L2SwimlaneAicpuPhaseHeader) + num_threads * sizeof(L2SwimlaneAicpuPhasePool)
  */
 
-#ifndef SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_
-#define SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_
+#ifndef SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
+#define SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
 
 #include <cstdint>
 #include <vector>
@@ -61,7 +61,7 @@
 #include "common/platform_config.h"
 
 // =============================================================================
-// L2 perf_level — granularity ladder for the L2 swimlane profiler.
+// L2 swimlane_level — granularity ladder for the L2 swimlane profiler.
 //
 // Each level is a strict superset of the previous: higher levels add the data
 // described by their name on top of all lower-level data. Naming describes
@@ -69,12 +69,12 @@
 // naturally — e.g. `if (level >= SCHED_PHASES)` means "this section runs when
 // scheduler phase records are being collected (or any higher tier)".
 //
-// Transported via `L2PerfDataHeader::l2_perf_level` (host → AICPU,
+// Transported via `L2SwimlaneDataHeader::l2_swimlane_level` (host → AICPU,
 // shared memory) and `CallConfig::enable_l2_swimlane` (Python → C). The wire
 // representation stays integer (uint32_t / int32_t) for ABI stability; this
 // enum is the canonical in-code type used for comparisons.
 // =============================================================================
-enum class L2PerfLevel : uint32_t {
+enum class L2SwimlaneLevel : uint32_t {
     DISABLED = 0,       // No collection at all
     AICORE_TIMING = 1,  // AICore per-task start/end timestamps + task record buffer
     AICPU_TIMING = 2,   // + AICPU dispatch/finish timestamps
@@ -83,7 +83,7 @@ enum class L2PerfLevel : uint32_t {
 };
 
 // =============================================================================
-// L2PerfRecord - Single Task Execution Record
+// L2SwimlaneAicpuTaskRecord - Single Task Execution Record
 // =============================================================================
 
 /**
@@ -95,7 +95,7 @@ enum class L2PerfLevel : uint32_t {
  * critical fanin tail. The host swimlane export emits empty fanout
  * fields; `swimlane_converter.py` joins deps.json at post-process time.
  */
-struct L2PerfRecord {
+struct L2SwimlaneAicpuTaskRecord {
     // Timing information (device clock timestamps)
     uint64_t start_time;  // Task start timestamp (get_sys_cnt) — host-filled at flush from AICore buffer
     uint64_t end_time;    // Task end timestamp — host-filled at flush from AICore buffer
@@ -114,34 +114,37 @@ struct L2PerfRecord {
     CoreType core_type;    // Core type (AIC/AIV)
     uint32_t reg_task_id;  // Register dispatch token (monotonic per core).
                            // Used by the host as the join key against
-                           // L2PerfAicoreRecord.task_id, which is what
+                           // L2SwimlaneAicoreTaskRecord.task_id, which is what
                            // AICore writes into the slim record.
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(L2PerfRecord) % 64 == 0, "L2PerfRecord must be 64-byte aligned for optimal cache performance");
+static_assert(
+    sizeof(L2SwimlaneAicpuTaskRecord) % 64 == 0,
+    "L2SwimlaneAicpuTaskRecord must be 64-byte aligned for optimal cache performance"
+);
 
 // =============================================================================
-// L2PerfAicoreRecord - Slim AICore-Only Record (written by AICore, read by Host)
+// L2SwimlaneAicoreTaskRecord - Slim AICore-Only Record (written by AICore, read by Host)
 // =============================================================================
 
 /**
  * Slim per-task record written by AICore directly into its own per-core
  * output buffer (no staging slot, no AICPU read). AICPU never touches this
  * record. The host post-processor joins it against the AICPU-side
- * L2PerfRecord on `task_id` at flush time.
+ * L2SwimlaneAicpuTaskRecord by matching this `task_id` to its `reg_task_id` at flush time.
  *
  * Layout: 24B payload + 8B pad → 32B (half a cache line). Two records pack
  * into one cache line so AICore's per-task store is at most a single line
  * commit + dcci.
  */
-struct L2PerfAicoreRecord {
+struct L2SwimlaneAicoreTaskRecord {
     uint64_t start_time;  // Task start timestamp (get_sys_cnt)
     uint64_t end_time;    // Task end timestamp
     uint32_t task_id;     // Register dispatch token (low 32 bits)
     uint32_t _pad;
 } __attribute__((aligned(32)));
 
-static_assert(sizeof(L2PerfAicoreRecord) == 32, "L2PerfAicoreRecord must be 32B");
+static_assert(sizeof(L2SwimlaneAicoreTaskRecord) == 32, "L2SwimlaneAicoreTaskRecord must be 32B");
 
 // =============================================================================
 // TypedBuffer<Record, N> - Templated Fixed-Size Profiling Buffer
@@ -149,13 +152,13 @@ static_assert(sizeof(L2PerfAicoreRecord) == 32, "L2PerfAicoreRecord must be 32B"
 
 /**
  * Generic fixed-capacity profiling buffer: contiguous record array followed
- * by a producer-written count. Layout matches the legacy L2PerfBuffer so the
+ * by a producer-written count. Layout matches the legacy L2SwimlaneAicpuTaskBuffer so the
  * host allocator and the AICPU consumer can treat all concrete instances
  * uniformly.
  *
  * Concrete instantiations live below as `using` aliases.
- *   - L2PerfBuffer        — AICPU-written, rotated, ready-queue tagged is_phase=0
- *   - L2PerfAicoreBuffer  — AICore-written, NOT rotated (sized for the full
+ *   - L2SwimlaneAicpuTaskBuffer   — AICPU-written, rotated, ready-queue tagged kind=AicpuTask
+ *   - L2SwimlaneAicoreTaskBuffer  — AICore-written, rotated per core (not sized for the full
  *                           session), read by host at flush time
  */
 template <typename Record, size_t N>
@@ -164,9 +167,9 @@ struct TypedBuffer {
     volatile uint32_t count;
 } __attribute__((aligned(64)));
 
-using L2PerfBuffer = TypedBuffer<L2PerfRecord, PLATFORM_PROF_BUFFER_SIZE>;
+using L2SwimlaneAicpuTaskBuffer = TypedBuffer<L2SwimlaneAicpuTaskRecord, PLATFORM_PROF_BUFFER_SIZE>;
 
-// AICore buffer is rotated like L2PerfBuffer: a small fixed capacity per
+// AICore buffer is rotated like L2SwimlaneAicpuTaskBuffer: a small fixed capacity per
 // buffer plus a per-core pool, so an arbitrarily long session never wraps.
 // Per-buffer capacity is a power of two so the AICore-local
 // `slot_within_buf` increment lowers to a bitwise AND for boundary checks.
@@ -180,10 +183,10 @@ static_assert(
 // ready-queue capacity formula there can include the AICore pool's worst-case
 // burst depth alongside the AICPU and Phase pools.
 
-using L2PerfAicoreBuffer = TypedBuffer<L2PerfAicoreRecord, PLATFORM_AICORE_BUFFER_SIZE>;
+using L2SwimlaneAicoreTaskBuffer = TypedBuffer<L2SwimlaneAicoreTaskRecord, PLATFORM_AICORE_BUFFER_SIZE>;
 
 // =============================================================================
-// L2PerfFreeQueue - SPSC Lock-Free Queue for Free Buffers
+// L2SwimlaneFreeQueue - SPSC Lock-Free Queue for Free Buffers
 // =============================================================================
 
 /**
@@ -201,17 +204,17 @@ using L2PerfAicoreBuffer = TypedBuffer<L2PerfAicoreRecord, PLATFORM_AICORE_BUFFE
  * - Device pop: rmb() → read tail → read buffer_ptrs[head % COUNT] → rmb() → write head → wmb()
  * - Host push: write buffer_ptrs[tail % COUNT] → wmb() → write tail → wmb()
  */
-struct L2PerfFreeQueue {
+struct L2SwimlaneFreeQueue {
     volatile uint64_t buffer_ptrs[PLATFORM_PROF_SLOT_COUNT];  // Free buffer addresses
     volatile uint32_t head;                                   // Consumer read position (Device increments)
     volatile uint32_t tail;                                   // Producer write position (Host increments)
     uint32_t pad[13];                                         // Pad to 128 bytes (aligned to cache line)
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes for cache alignment");
+static_assert(sizeof(L2SwimlaneFreeQueue) == 128, "L2SwimlaneFreeQueue must be 128 bytes for cache alignment");
 
 // =============================================================================
-// L2PerfBufferState - Per-Core/Thread Buffer State (Unified for L2PerfRecord and Phase)
+// L2SwimlaneAicpuTaskPool - Per-Core/Thread Buffer Pool (Unified for L2SwimlaneAicpuTaskRecord and Phase)
 // =============================================================================
 
 /**
@@ -220,62 +223,46 @@ static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes
  * Contains:
  * - free_queue: SPSC queue of available buffer addresses
  * - current_buf_ptr: Currently active buffer being written (0 = no active
- *   buffer). AICPU writes L2PerfRecord into it; rotated by AICPU when full.
+ *   buffer). AICPU writes records into it; rotated by AICPU when full.
  * - current_buf_seq: Monotonic sequence number for ordering
- * - aicore_ring_ptr: Per-core L2PerfAicoreBuffer device address (L2PerfRecord
- *   profiling only; 0 for Phase). Allocated once by the host, addressed
- *   through this field; AICore reads timing-publication target via
- *   KernelArgs::aicore_ring_addr (a flat uint64_t[num_cores] table the host
- *   builds from this field). Never reassigned during the run. AICPU does
- *   NOT touch it on the hot path post-AICore-as-producer; the host reads
- *   it at flush time and joins the slim AICore records into the L2PerfRecord
- *   stream by reg_task_id.
  * - total_record_count / dropped_record_count: per-core/-thread tallies
  *   AICPU keeps so the host can cross-check `collected + dropped ==
  *   device_total` at end-of-run.
- * - mismatch_record_count: legacy field for the pre-AICore-as-producer
- *   ring-slot mismatch class. No longer written; kept for ABI continuity.
  *
  * Used in two contexts:
- * - Per-core L2PerfRecord profiling (current_buf_ptr → L2PerfBuffer,
- *   aicore_ring_ptr → L2PerfAicoreBuffer)
- * - Per-thread Phase profiling (current_buf_ptr → PhaseBuffer,
- *   aicore_ring_ptr unused)
+ * - Per-core L2SwimlaneAicpuTaskRecord profiling (current_buf_ptr → L2SwimlaneAicpuTaskBuffer)
+ * - Per-thread Phase profiling (current_buf_ptr → L2SwimlaneAicpuPhaseBuffer)
  *
  * Writers:
  * - free_queue.tail: Host writes (pushes new buffers)
  * - free_queue.head: Device writes (pops buffers)
  * - current_buf_ptr: Device writes (after pop), Host reads (for flush/collect)
  * - current_buf_seq: Device writes (monotonic counter)
- * - aicore_ring_ptr: Host writes once at init; AICPU never reads on the
- *   hot path; host reads at flush time to do the AICore-side join.
  * - total_record_count / dropped_record_count: Device writes, Host reads
  *   at drain time (no concurrency on a per-state basis since each state
  *   belongs to a single core/thread).
  */
-struct L2PerfBufferState {
-    L2PerfFreeQueue free_queue;               // SPSC queue of free buffer addresses
-    volatile uint64_t current_buf_ptr;        // Current active L2PerfBuffer (0 = none)
-    volatile uint64_t aicore_ring_ptr;        // Per-core L2PerfAicoreBuffer (L2Perf only; 0 for Phase)
-    volatile uint32_t current_buf_seq;        // Sequence number for ordering
-    volatile uint32_t total_record_count;     // Records the AICPU attempted to write to this state
-    volatile uint32_t dropped_record_count;   // Records dropped (queue full / overwrite / no buffer)
-    volatile uint32_t mismatch_record_count;  // Legacy: ring/task_id mismatches (no longer written, kept for ABI)
-    uint32_t pad[8];                          // Pad to 192 bytes (aligned to cache line)
+struct L2SwimlaneAicpuTaskPool {
+    L2SwimlaneFreeQueue free_queue;          // SPSC queue of free buffer addresses
+    volatile uint64_t current_buf_ptr;       // Current active L2SwimlaneAicpuTaskBuffer (0 = none)
+    volatile uint32_t current_buf_seq;       // Sequence number for ordering
+    volatile uint32_t total_record_count;    // Records the AICPU attempted to write to this state
+    volatile uint32_t dropped_record_count;  // Records dropped (queue full / overwrite / no buffer)
+    uint32_t pad[11];                        // Pad to 192 bytes (aligned to cache line)
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(L2PerfBufferState) == 192, "L2PerfBufferState must be 192 bytes for cache alignment");
+static_assert(sizeof(L2SwimlaneAicpuTaskPool) == 192, "L2SwimlaneAicpuTaskPool must be 192 bytes for cache alignment");
 
 // Type alias for semantic clarity in Phase profiling context
-using PhaseBufferState = L2PerfBufferState;  // Per-thread Phase profiling
+using L2SwimlaneAicpuPhasePool = L2SwimlaneAicpuTaskPool;  // Per-thread Phase profiling
 
 // =============================================================================
-// AicoreRotation - Per-Core AICore Buffer Rotation Channel
+// L2SwimlaneAicoreRotation - Per-Core AICore Buffer Rotation Channel
 // =============================================================================
 
 /**
  * Single cache-line struct AICore reads on every task to decide which
- * L2PerfAicoreBuffer to write into. AICPU updates it when rotating; AICore
+ * L2SwimlaneAicoreTaskBuffer to write into. AICPU updates it when rotating; AICore
  * detects the change via the generation counter and resets its local slot.
  *
  *   Writer: AICPU (host writes initial values at init)
@@ -288,69 +275,73 @@ using PhaseBufferState = L2PerfBufferState;  // Per-thread Phase profiling
  * have FIN'd, so AICore has already finished writing their records into the
  * old buffer before AICPU enqueues it to ready_queue.
  */
-struct AicoreRotation {
-    volatile uint64_t current_buf_ptr;  // Device address of the active L2PerfAicoreBuffer
+struct L2SwimlaneAicoreRotation {
+    volatile uint64_t current_buf_ptr;  // Device address of the active L2SwimlaneAicoreTaskBuffer
     volatile uint32_t generation;       // Bumps on each rotation; AICore compares to detect changes
     uint32_t _pad_a;
     uint32_t _pad_b[12];
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(AicoreRotation) == 64, "AicoreRotation must be one cache line");
+static_assert(sizeof(L2SwimlaneAicoreRotation) == 64, "L2SwimlaneAicoreRotation must be one cache line");
 
 // =============================================================================
-// L2PerfAicoreBufferState - Per-Core AICore Pool State
+// L2SwimlaneAicoreTaskPool - Per-Core AICore Pool State
 // =============================================================================
 
 /**
  * Per-core AICore-side rotation state. Owns:
  *   - rotation: the cache line AICore polls
- *   - free_queue: SPSC queue of recycled L2PerfAicoreBuffer*; host pushes,
+ *   - free_queue: SPSC queue of recycled L2SwimlaneAicoreTaskBuffer*; host pushes,
  *                 AICPU pops when rotating
  *   - total_record_count / dropped_record_count: AICPU-maintained tallies
  *
  * Note that AICore records flow through the existing per-thread ready_queue
- * in L2PerfDataHeader (with ReadyQueueEntry::is_phase=2). This keeps the
- * mgmt-thread drain path uniform with the L2PerfBuffer / PhaseBuffer paths.
+ * in L2SwimlaneDataHeader (with ReadyQueueEntry::kind = AicoreTask). This keeps the
+ * mgmt-thread drain path uniform with the L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer paths.
  */
-struct L2PerfAicoreBufferState {
-    AicoreRotation rotation;                 // 64B — cache-line independent
-    L2PerfFreeQueue free_queue;              // 128B
+struct L2SwimlaneAicoreTaskPool {
+    L2SwimlaneAicoreRotation rotation;       // 64B — cache-line independent
+    L2SwimlaneFreeQueue free_queue;          // 128B
     volatile uint32_t total_record_count;    // AICPU dispatches that should have been recorded
     volatile uint32_t dropped_record_count;  // Buffers dropped (free_queue empty at rotation time)
     volatile uint32_t current_buf_seq;       // Monotonic per-core rotation counter
     uint32_t pad[13];                        // → 256B total
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(L2PerfAicoreBufferState) == 256, "L2PerfAicoreBufferState must be 256 bytes");
+static_assert(sizeof(L2SwimlaneAicoreTaskPool) == 256, "L2SwimlaneAicoreTaskPool must be 256 bytes");
 
 // =============================================================================
 // ReadyQueueEntry - Queue Entry for Ready Buffers
 // =============================================================================
 
+/**
+ * Buffer kind for ReadyQueueEntry::kind. Wire-stable uint32_t underlying so the
+ * struct layout matches the prior `is_phase` field byte-for-byte. AicpuTask and
+ * AicpuPhase keep the historical values 0 and 1; AicoreTask keeps the historical 2.
+ */
+enum class L2SwimlaneBufferKind : uint32_t {
+    AicpuTask = 0,   // Per-core L2SwimlaneAicpuTaskBuffer, AICPU writes
+    AicpuPhase = 1,  // Per-thread L2SwimlaneAicpuPhaseBuffer, AICPU writes
+    AicoreTask = 2,  // Per-core L2SwimlaneAicoreTaskBuffer, AICore writes, AICPU enqueues at rotation
+};
+
 /**
  * Ready queue entry
  *
- * When a buffer on a core/thread is full, the producer (AICPU for kinds 0/1,
- * AICPU on behalf of AICore for kind 2) pushes this entry. Host memory
- * manager retrieves entries from the queue.
- *
- * Entry kinds (carried in the is_phase field — name is historical, semantic
- * is "buffer kind"):
- *   0: L2PerfBuffer       (per-core,   AICPU writes)
- *   1: PhaseBuffer        (per-thread, AICPU writes)
- *   2: L2PerfAicoreBuffer (per-core,   AICore writes, AICPU enqueues at
- *                          rotation time)
+ * When a buffer on a core/thread is full, the producer (AICPU for
+ * AicpuTask/AicpuPhase, AICPU on behalf of AICore for AicoreTask) pushes this
+ * entry. Host memory manager retrieves entries from the queue.
  */
 struct ReadyQueueEntry {
-    uint32_t core_index;  // Core index (0 ~ num_cores-1), or thread_idx for phase entries
-    uint32_t is_phase;    // Buffer kind: 0=L2PerfBuffer, 1=PhaseBuffer, 2=L2PerfAicoreBuffer
-    uint64_t buffer_ptr;  // Device pointer to the full buffer
-    uint32_t buffer_seq;  // Sequence number for ordering
-    uint32_t pad;         // Alignment padding
+    uint32_t core_index;        // Core index (0 ~ num_cores-1), or thread_idx for phase entries
+    L2SwimlaneBufferKind kind;  // Buffer kind discriminator (uint32_t underlying)
+    uint64_t buffer_ptr;        // Device pointer to the full buffer
+    uint32_t buffer_seq;        // Sequence number for ordering
+    uint32_t pad;               // Alignment padding
 } __attribute__((aligned(32)));
 
 // =============================================================================
-// L2PerfDataHeader - Fixed Header
+// L2SwimlaneDataHeader - Fixed Header
 // =============================================================================
 
 /**
@@ -369,7 +360,7 @@ struct ReadyQueueEntry {
  * - Queue empty: head == tail
  * - Queue full: (tail + 1) % capacity == head
  */
-struct L2PerfDataHeader {
+struct L2SwimlaneDataHeader {
     // Per-thread ready queues (FIFO Circular Buffers)
     // Each AICPU thread has its own queue to avoid lock contention
     ReadyQueueEntry queues[PLATFORM_MAX_AICPU_THREADS][PLATFORM_PROF_READYQUEUE_SIZE];
@@ -377,10 +368,10 @@ struct L2PerfDataHeader {
     volatile uint32_t queue_tails[PLATFORM_MAX_AICPU_THREADS];  // Producer write positions (AICPU modifies)
 
     // Metadata (Host initializes, Device read-only)
-    uint32_t num_cores;      // Actual number of cores launched
-    uint32_t l2_perf_level;  // 0=off, 1=AICore timing, 2=+dispatch/fanout,
-                             // 3=+sched phases, 4=+orch phases. Host writes
-                             // at init; AICPU reads in l2_perf_aicpu_init.
+    uint32_t num_cores;          // Actual number of cores launched
+    uint32_t l2_swimlane_level;  // 0=off, 1=AICore timing, 2=+dispatch/fanout,
+                                 // 3=+sched phases, 4=+orch phases. Host writes
+                                 // at init; AICPU reads in l2_swimlane_aicpu_init.
 } __attribute__((aligned(64)));
 
 // =============================================================================
@@ -417,7 +408,7 @@ struct L2PerfDataHeader {
  *           Old captures may carry them; host parser maps to "unknown"
  *           and tools drop them.
  */
-enum class AicpuPhaseId : uint32_t {
+enum class L2SwimlaneAicpuPhaseId : uint32_t {
     // Scheduler phases (per scheduler loop iter)
     SCHED_COMPLETE = 0,  // Process completed tasks (fanin traversal)
     SCHED_DISPATCH = 1,  // Dispatch ready tasks to idle cores
@@ -437,11 +428,11 @@ enum class AicpuPhaseId : uint32_t {
  *   SCHED_COMPLETE: extras are 0.
  *   Orchestrator phases: extras are 0 (reserved for future per-phase metrics).
  */
-struct AicpuPhaseRecord {
-    uint64_t start_time;    // Phase start timestamp
-    uint64_t end_time;      // Phase end timestamp
-    uint32_t loop_iter;     // Loop iteration number
-    AicpuPhaseId phase_id;  // Phase type
+struct L2SwimlaneAicpuPhaseRecord {
+    uint64_t start_time;              // Phase start timestamp
+    uint64_t end_time;                // Phase end timestamp
+    uint32_t loop_iter;               // Loop iteration number
+    L2SwimlaneAicpuPhaseId phase_id;  // Phase type
     union {
         uint64_t task_id;          // tensormap_and_ringbuffer: full PTO2 encoding
                                    // (ring_id << 32) | local_id for cross-view correlation.
@@ -450,32 +441,25 @@ struct AicpuPhaseRecord {
     uint32_t extra1;  // Phase-specific delta (e.g. SCHED_DISPATCH = pop_hit)
     uint32_t extra2;  // Phase-specific delta (e.g. SCHED_DISPATCH = pop_miss)
 };
-static_assert(sizeof(AicpuPhaseRecord) == 40, "AicpuPhaseRecord layout drift");
+static_assert(sizeof(L2SwimlaneAicpuPhaseRecord) == 40, "L2SwimlaneAicpuPhaseRecord layout drift");
 
-constexpr uint32_t AICPU_PHASE_MAGIC = 0x41435048;        // "ACPH"
-constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 16384;  // ~512KB per thread
+constexpr uint32_t L2_SWIMLANE_AICPU_PHASE_MAGIC = 0x41435048;  // "ACPH"
+constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 16384;        // ~512KB per thread
 
-/**
- * Fixed-size phase record buffer (analogous to L2PerfBuffer)
- *
- * Capacity: PLATFORM_PHASE_RECORDS_PER_THREAD
- * Allocated dynamically by Host, pushed into per-thread free_queue.
- */
-struct PhaseBuffer {
-    AicpuPhaseRecord records[PLATFORM_PHASE_RECORDS_PER_THREAD];
-    volatile uint32_t count;
-} __attribute__((aligned(64)));
+// Fixed-size phase record buffer. Same TypedBuffer template as L2SwimlaneAicpuTaskBuffer
+// and L2SwimlaneAicoreTaskBuffer — keeps the drain machinery uniform.
+using L2SwimlaneAicpuPhaseBuffer = TypedBuffer<L2SwimlaneAicpuPhaseRecord, PLATFORM_PHASE_RECORDS_PER_THREAD>;
 
 /**
  * AICPU phase profiling header
  *
- * Located after the L2PerfBufferState array in shared memory.
+ * Located after the L2SwimlaneAicpuTaskPool array in shared memory.
  * Contains metadata and per-thread tracking.
  */
-struct AicpuPhaseHeader {
-    uint32_t magic;                             // Validation magic (AICPU_PHASE_MAGIC)
+struct L2SwimlaneAicpuPhaseHeader {
+    uint32_t magic;                             // Validation magic (L2_SWIMLANE_AICPU_PHASE_MAGIC)
     uint32_t num_sched_threads;                 // Number of scheduler threads
-    uint32_t records_per_thread;                // Max records per PhaseBuffer
+    uint32_t records_per_thread;                // Max records per L2SwimlaneAicpuPhaseBuffer
     uint32_t num_cores;                         // Total number of cores with valid assignments
     int8_t core_to_thread[PLATFORM_MAX_CORES];  // core_id → scheduler thread index (-1 = unassigned)
 } __attribute__((aligned(64)));
@@ -492,41 +476,45 @@ extern "C" {
  * Calculate total memory size for performance data (buffer states only, no buffers)
  *
  * Formula: Total size = Fixed header + Dynamic tail
- *                     = sizeof(L2PerfDataHeader) + num_cores × sizeof(L2PerfBufferState)
+ *                     = sizeof(L2SwimlaneDataHeader) + num_cores × sizeof(L2SwimlaneAicpuTaskPool)
  *
  * @param num_cores Number of cores (block_dim × PLATFORM_CORES_PER_BLOCKDIM)
  * @return Total bytes for header + buffer states
  */
 inline size_t calc_perf_data_size(int num_cores) {
-    return sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState);
+    return sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool);
 }
 
 /**
  * Get header pointer
  *
  * @param base_ptr Shared memory base address (device_ptr or host_ptr)
- * @return L2PerfDataHeader pointer
+ * @return L2SwimlaneDataHeader pointer
  */
-inline L2PerfDataHeader *get_l2_perf_header(void *base_ptr) { return reinterpret_cast<L2PerfDataHeader *>(base_ptr); }
+inline L2SwimlaneDataHeader *get_l2_swimlane_header(void *base_ptr) {
+    return reinterpret_cast<L2SwimlaneDataHeader *>(base_ptr);
+}
 
 /**
- * Get L2PerfBufferState array start address
+ * Get L2SwimlaneAicpuTaskPool array start address
  *
  * @param base_ptr Shared memory base address
- * @return L2PerfBufferState array pointer
+ * @return L2SwimlaneAicpuTaskPool array pointer
  */
-inline L2PerfBufferState *get_perf_buffer_states(void *base_ptr) {
-    return reinterpret_cast<L2PerfBufferState *>(reinterpret_cast<char *>(base_ptr) + sizeof(L2PerfDataHeader));
+inline L2SwimlaneAicpuTaskPool *get_perf_buffer_states(void *base_ptr) {
+    return reinterpret_cast<L2SwimlaneAicpuTaskPool *>(
+        reinterpret_cast<char *>(base_ptr) + sizeof(L2SwimlaneDataHeader)
+    );
 }
 
 /**
- * Get L2PerfBufferState for specified core
+ * Get L2SwimlaneAicpuTaskPool for specified core
  *
  * @param base_ptr Shared memory base address
  * @param core_index Core index (0 ~ num_cores-1)
- * @return L2PerfBufferState pointer
+ * @return L2SwimlaneAicpuTaskPool pointer
  */
-inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index) {
+inline L2SwimlaneAicpuTaskPool *get_perf_buffer_state(void *base_ptr, int core_index) {
     return &get_perf_buffer_states(base_ptr)[core_index];
 }
 
@@ -534,55 +522,55 @@ inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index)
  * Calculate total memory size including AICore states and phase profiling
  * region (buffer states only, not the record payloads themselves).
  *
- * Layout (after the fixed L2PerfDataHeader):
- *   [L2PerfBufferState × num_cores]
- *   [L2PerfAicoreBufferState × num_cores]
- *   [AicpuPhaseHeader]
- *   [PhaseBufferState × num_sched_threads]
+ * Layout (after the fixed L2SwimlaneDataHeader):
+ *   [L2SwimlaneAicpuTaskPool × num_cores]
+ *   [L2SwimlaneAicoreTaskPool × num_cores]
+ *   [L2SwimlaneAicpuPhaseHeader]
+ *   [L2SwimlaneAicpuPhasePool × num_sched_threads]
  *
  * @param num_cores Number of AICore instances
  * @param num_sched_threads Number of phase profiling threads (scheduler + orchestrator)
  * @return Total bytes needed for header + all buffer states
  */
 inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) {
-    return calc_perf_data_size(num_cores) + num_cores * sizeof(L2PerfAicoreBufferState) + sizeof(AicpuPhaseHeader) +
-           num_sched_threads * sizeof(PhaseBufferState);
+    return calc_perf_data_size(num_cores) + num_cores * sizeof(L2SwimlaneAicoreTaskPool) +
+           sizeof(L2SwimlaneAicpuPhaseHeader) + num_sched_threads * sizeof(L2SwimlaneAicpuPhasePool);
 }
 
 /**
- * Get L2PerfAicoreBufferState array start address (located immediately
- * after the L2PerfBufferState array, before the AicpuPhaseHeader).
+ * Get L2SwimlaneAicoreTaskPool array start address (located immediately
+ * after the L2SwimlaneAicpuTaskPool array, before the L2SwimlaneAicpuPhaseHeader).
  */
-inline L2PerfAicoreBufferState *get_aicore_buffer_states(void *base_ptr, int num_cores) {
-    return reinterpret_cast<L2PerfAicoreBufferState *>(
+inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_states(void *base_ptr, int num_cores) {
+    return reinterpret_cast<L2SwimlaneAicoreTaskPool *>(
         reinterpret_cast<char *>(base_ptr) + calc_perf_data_size(num_cores)
     );
 }
 
-inline L2PerfAicoreBufferState *get_aicore_buffer_state(void *base_ptr, int num_cores, int core_index) {
+inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_state(void *base_ptr, int num_cores, int core_index) {
     return &get_aicore_buffer_states(base_ptr, num_cores)[core_index];
 }
 
 /**
- * Get AicpuPhaseHeader pointer (located after the L2PerfAicoreBufferState array).
+ * Get L2SwimlaneAicpuPhaseHeader pointer (located after the L2SwimlaneAicoreTaskPool array).
  */
-inline AicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) {
-    return reinterpret_cast<AicpuPhaseHeader *>(
+inline L2SwimlaneAicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) {
+    return reinterpret_cast<L2SwimlaneAicpuPhaseHeader *>(
         reinterpret_cast<char *>(base_ptr) + calc_perf_data_size(num_cores) +
-        num_cores * sizeof(L2PerfAicoreBufferState)
+        num_cores * sizeof(L2SwimlaneAicoreTaskPool)
     );
 }
 
 /**
- * Get PhaseBufferState array start address (located after AicpuPhaseHeader)
+ * Get L2SwimlaneAicpuPhasePool array start address (located after L2SwimlaneAicpuPhaseHeader)
  */
-inline PhaseBufferState *get_phase_buffer_states(void *base_ptr, int num_cores) {
-    return reinterpret_cast<PhaseBufferState *>(
-        reinterpret_cast<char *>(get_phase_header(base_ptr, num_cores)) + sizeof(AicpuPhaseHeader)
+inline L2SwimlaneAicpuPhasePool *get_phase_buffer_states(void *base_ptr, int num_cores) {
+    return reinterpret_cast<L2SwimlaneAicpuPhasePool *>(
+        reinterpret_cast<char *>(get_phase_header(base_ptr, num_cores)) + sizeof(L2SwimlaneAicpuPhaseHeader)
     );
 }
 
-inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) {
+inline L2SwimlaneAicpuPhasePool *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) {
     return &get_phase_buffer_states(base_ptr, num_cores)[thread_idx];
 }
 
@@ -590,4 +578,4 @@ inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, i
 }
 #endif
 
-#endif  // SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_
+#endif  // SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h
index a514503a2..e070396b1 100644
--- a/src/a2a3/platform/include/common/platform_config.h
+++ b/src/a2a3/platform/include/common/platform_config.h
@@ -104,7 +104,7 @@ constexpr int PLATFORM_MAX_CORES = PLATFORM_MAX_BLOCKDIM * PLATFORM_CORES_PER_BL
 
 /**
  * Performance buffer capacity per buffer
- * Number of L2PerfRecord entries per dynamically allocated L2PerfBuffer
+ * Number of L2SwimlaneAicpuTaskRecord entries per dynamically allocated L2SwimlaneAicpuTaskBuffer
  */
 constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000;
 
@@ -118,13 +118,13 @@ constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000;
 constexpr int PLATFORM_PROF_SLOT_COUNT = 4;
 
 /**
- * L2PerfBuffer pre-allocation count per AICore.
+ * L2SwimlaneAicpuTaskBuffer pre-allocation count per AICore.
  * 1 goes into the free_queue at init, the rest into the recycled pool.
  */
 constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8;
 
 /**
- * L2PerfAicoreBuffer pre-allocation count per AICore (AICore-as-producer pool).
+ * L2SwimlaneAicoreTaskBuffer pre-allocation count per AICore (AICore-as-producer pool).
  * 1 goes into the free_queue at init, the rest into the recycled pool.
  * Mirrors PLATFORM_PROF_BUFFERS_PER_CORE in role; smaller because AICore records
  * are slim (32 B each) and the buffer is also smaller per the rotation design.
@@ -132,7 +132,7 @@ constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8;
 constexpr int PLATFORM_AICORE_BUFFERS_PER_CORE = 4;
 
 /**
- * PhaseBuffer pre-allocation count per AICPU thread.
+ * L2SwimlaneAicpuPhaseBuffer pre-allocation count per AICPU thread.
  * 1 goes into the free_queue at init, the rest into the recycled pool.
  */
 constexpr int PLATFORM_PROF_BUFFERS_PER_THREAD = 16;
@@ -141,8 +141,8 @@ constexpr int PLATFORM_PROF_BUFFERS_PER_THREAD = 16;
  * Ready queue capacity for performance data collection.
  * Queue holds ReadyQueueEntry structs for buffers ready to be read by Host.
  * Sized to match pre-allocation total across all cores and threads, summed
- * over the three buffer kinds (AICPU L2PerfBuffer, PhaseBuffer,
- * AICore L2PerfAicoreBuffer).
+ * over the three buffer kinds (AICPU L2SwimlaneAicpuTaskBuffer, L2SwimlaneAicpuPhaseBuffer,
+ * AICore L2SwimlaneAicoreTaskBuffer).
  */
 constexpr int PLATFORM_PROF_READYQUEUE_SIZE = PLATFORM_MAX_CORES * PLATFORM_PROF_BUFFERS_PER_CORE +
                                               PLATFORM_MAX_AICPU_THREADS * PLATFORM_PROF_BUFFERS_PER_THREAD +
diff --git a/src/a2a3/platform/include/common/pmu_profiling.h b/src/a2a3/platform/include/common/pmu_profiling.h
index 846e87b6a..9c6def972 100644
--- a/src/a2a3/platform/include/common/pmu_profiling.h
+++ b/src/a2a3/platform/include/common/pmu_profiling.h
@@ -17,7 +17,7 @@
  * Software License 2.0). Register offsets live in platform_config.h and are
  * accessed via RegId / reg_index().
  *
- * Streaming buffer design (mirrors l2_perf_profiling.h):
+ * Streaming buffer design (mirrors l2_swimlane_profiling.h):
  *   PmuFreeQueue    — SPSC queue: Host pushes free PmuBuffers, AICPU pops them.
  *   PmuBufferState  — Per-core state: current active buffer pointer + free_queue.
  *   PmuDataHeader   — Fixed shared-memory header: per-thread ready queues.
@@ -54,7 +54,7 @@ constexpr uint32_t PMU_EVENT_TYPE_DEFAULT = static_cast<uint32_t>(PmuEventType::
 
 /**
  * Event ID table for a single event type.
- * `event_ids[i]` programs PMU_CNTi_IDX; `counters[i]` in the L2PerfRecord is the
+ * `event_ids[i]` programs PMU_CNTi_IDX; `pmu_counters[i]` in the PmuRecord is the
  * value of PMU_CNTi after the task completes.
  * `counter_names[i]` is the human-readable CSV column name for counter i.
  * Empty string ("") marks an unused slot.
@@ -134,7 +134,7 @@ inline const PmuEventConfig *pmu_resolve_event_config_a2a3(PmuEventType event_ty
  * Per-task PMU snapshot written by AICPU after each AICore task FIN.
  */
 struct PmuRecord {
-    uint64_t task_id;                               // Same encoding as L2PerfRecord.task_id
+    uint64_t task_id;                               // Same encoding as L2SwimlaneAicpuTaskRecord.task_id
     uint32_t func_id;                               // Kernel function identifier
     CoreType core_type;                             // AIC or AIV
     uint64_t pmu_total_cycles;                      // PMU_CNT_TOTAL (64-bit combined)
@@ -142,7 +142,7 @@ struct PmuRecord {
 } __attribute__((aligned(64)));
 
 // =============================================================================
-// PMU Streaming Buffer Structures (mirrors l2_perf_profiling.h)
+// PMU Streaming Buffer Structures (mirrors l2_swimlane_profiling.h)
 // =============================================================================
 
 /**
diff --git a/src/a2a3/platform/include/common/scope_stats.h b/src/a2a3/platform/include/common/scope_stats.h
index 88efa72dd..844e34089 100644
--- a/src/a2a3/platform/include/common/scope_stats.h
+++ b/src/a2a3/platform/include/common/scope_stats.h
@@ -17,7 +17,7 @@
  * scope_end — each carrying the task/heap ring start/end and the tensormap
  * live-entry count sampled at that boundary, tagged with a phase flag. Records
  * stream off the device in
- * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_perf (the
+ * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_swimlane (the
  * single source of mgmt-loop truth is
  * src/a2a3/platform/include/host/profiling_common/profiler_base.h):
  *
diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h
index 48afba1cf..afbebcb27 100644
--- a/src/a2a3/platform/include/common/tensor_dump.h
+++ b/src/a2a3/platform/include/common/tensor_dump.h
@@ -139,7 +139,7 @@ struct DumpMetaBuffer {
 
 /**
  * Single Producer Single Consumer (SPSC) lock-free queue.
- * Same layout and semantics as L2PerfFreeQueue, separate type for decoupling.
+ * Same layout and semantics as L2SwimlaneFreeQueue, separate type for decoupling.
  *
  * Producer: Host (DumpMemoryManager thread) pushes recycled/new buffers
  * Consumer: Device (AICPU thread) pops buffers when switching
diff --git a/src/a2a3/platform/include/host/dep_gen_collector.h b/src/a2a3/platform/include/host/dep_gen_collector.h
index 5c48723df..ae036683c 100644
--- a/src/a2a3/platform/include/host/dep_gen_collector.h
+++ b/src/a2a3/platform/include/host/dep_gen_collector.h
@@ -264,7 +264,7 @@ class DepGenCollector : public profiling_common::ProfilerBase<DepGenCollector, D
 inline std::string make_deps_json_path(const std::string &output_dir) {
     // Use std::filesystem::path's operator/ for join — robust against trailing
     // slashes or path quirks that bare string concat would silently pass
-    // through. The sibling make_pmu_csv_path / make_l2_perf_path still use
+    // through. The sibling make_pmu_csv_path / make_l2_swimlane_path still use
     // string concat; converting those is a follow-up cleanup since the
     // project's output_prefix paths come from scene_test.py's pathlib join
     // (never trailing-slashed in practice).
diff --git a/src/a2a3/platform/include/host/l2_perf_collector.h b/src/a2a3/platform/include/host/l2_swimlane_collector.h
similarity index 62%
rename from src/a2a3/platform/include/host/l2_perf_collector.h
rename to src/a2a3/platform/include/host/l2_swimlane_collector.h
index a44670d64..a86dfb7a5 100644
--- a/src/a2a3/platform/include/host/l2_perf_collector.h
+++ b/src/a2a3/platform/include/host/l2_swimlane_collector.h
@@ -10,21 +10,21 @@
  */
 
 /**
- * @file l2_perf_collector.h
+ * @file l2_swimlane_collector.h
  * @brief Platform-agnostic performance data collector with dynamic memory management.
  *
  * Architecture:
- * - BufferPoolManager<L2PerfModule>: shared mgmt-thread infrastructure that polls
+ * - BufferPoolManager<L2SwimlaneModule>: shared mgmt-thread infrastructure that polls
  *   the AICPU ready queue, replenishes per-core / per-thread free queues, and
  *   hands full buffers off to the collector thread.
- * - L2PerfCollector: main thread copies records from the manager's ready queue
+ * - L2SwimlaneCollector: main thread copies records from the manager's ready queue
  *   into host vectors and exports the swimlane visualization.
  *
  * Memory operations are injected through callbacks for sim/onboard portability.
  */
 
-#ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_
-#define SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_
+#ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
+#define SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
 
 #include <atomic>
 #include <cstdint>
@@ -33,33 +33,33 @@
 #include <thread>
 #include <vector>
 
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "host/profiling_common/profiler_base.h"
 
 // ---------------------------------------------------------------------------
-// L2 Perf profiling Module (drives BufferPoolManager<L2PerfModule>)
+// L2 Swimlane profiling Module (drives BufferPoolManager<L2SwimlaneModule>)
 // ---------------------------------------------------------------------------
 
 /**
  * L2 Perf has two distinct buffer kinds going through one ready queue per
  * AICPU thread:
- *   - kind 0: per-core L2PerfBuffer (task records)
- *   - kind 1: per-thread PhaseBuffer (scheduler/orchestrator phase records)
- * The ReadyQueueEntry::is_phase flag picks between them.
+ *   - kind 0: per-core L2SwimlaneAicpuTaskBuffer (task records)
+ *   - kind 1: per-thread L2SwimlaneAicpuPhaseBuffer (scheduler/orchestrator phase records)
+ * The ReadyQueueEntry::kind flag picks between them.
  */
 
 /**
  * Buffer kind discriminator carried in ReadyBufferInfo and used to index the
  * per-kind recycled pool inside BufferPoolManager.
- *   PERF_RECORD: per-core AICPU-written L2PerfBuffer
- *   PHASE:       per-thread AICPU-written PhaseBuffer
- *   AICORE:      per-core AICore-written L2PerfAicoreBuffer (rotation driven
+ *   AICPU_TASK:  per-core AICPU-written L2SwimlaneAicpuTaskBuffer
+ *   AICPU_PHASE: per-thread AICPU-written L2SwimlaneAicpuPhaseBuffer
+ *   AICORE_TASK: per-core AICore-written L2SwimlaneAicoreTaskBuffer (rotation driven
  *                by AICPU at dispatch boundaries)
  */
-enum class ProfBufferType { PERF_RECORD = 0, PHASE = 1, AICORE = 2 };
+enum class ProfBufferType { AICPU_TASK = 0, AICPU_PHASE = 1, AICORE_TASK = 2 };
 
 /**
  * Information about a ready (full) buffer, passed from mgmt thread to main thread.
@@ -73,16 +73,16 @@ struct ReadyBufferInfo {
     uint32_t buffer_seq;    // Sequence number for ordering
 };
 
-struct L2PerfModule {
-    using DataHeader = L2PerfDataHeader;
+struct L2SwimlaneModule {
+    using DataHeader = L2SwimlaneDataHeader;
     using ReadyEntry = ReadyQueueEntry;
     using ReadyBufferInfo = ::ReadyBufferInfo;
-    using FreeQueue = L2PerfFreeQueue;  // PhaseBufferState aliases L2PerfBufferState
+    using FreeQueue = L2SwimlaneFreeQueue;  // L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool
 
     static constexpr int kBufferKinds = 3;  // 0=PERF_RECORD, 1=PHASE, 2=AICORE
     static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT;
-    static constexpr const char *kSubsystemName = "L2PerfModule";
+    static constexpr const char *kSubsystemName = "L2SwimlaneModule";
 
     /**
      * batch_size for proactive_replenish's alloc fallback. Sized so that a
@@ -99,31 +99,34 @@ struct L2PerfModule {
 
     static int kind_of(const ReadyBufferInfo &info) { return static_cast<int>(info.type); }
 
-    static DataHeader *header_from_shm(void *shm) { return get_l2_perf_header(shm); }
+    static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); }
 
     /**
-     * Branch on entry.is_phase (kind discriminator 0/1/2) to pick the
-     * per-core perf state vs. the per-thread phase state vs. the per-core
-     * AICore state. Returns nullopt for out-of-range indices.
+     * Branch on entry.kind to pick the per-core perf state vs. the per-thread
+     * phase state vs. the per-core AICore state. Returns nullopt for
+     * out-of-range indices.
      */
-    static std::optional<profiling_common::EntrySite<L2PerfModule>>
+    static std::optional<profiling_common::EntrySite<L2SwimlaneModule>>
     resolve_entry(void *shm, DataHeader *header, int /*q*/, const ReadyEntry &entry) {
         const int num_cores = static_cast<int>(header->num_cores);
-        const uint32_t kind = entry.is_phase;
+        const L2SwimlaneBufferKind kind = entry.kind;
 
-        if (kind == 1) {
+        if (kind == L2SwimlaneBufferKind::AicpuPhase) {
             if (entry.core_index >= static_cast<uint32_t>(PLATFORM_MAX_AICPU_THREADS)) {
-                LOG_ERROR("L2PerfModule: invalid phase entry: thread=%u", entry.core_index);
+                LOG_ERROR("L2SwimlaneModule: invalid phase entry: thread=%u", entry.core_index);
                 return std::nullopt;
             }
         } else {
             if (entry.core_index >= static_cast<uint32_t>(num_cores)) {
-                LOG_ERROR("L2PerfModule: invalid perf entry: core=%u kind=%u", entry.core_index, kind);
+                LOG_ERROR(
+                    "L2SwimlaneModule: invalid perf entry: core=%u kind=%u", entry.core_index,
+                    static_cast<uint32_t>(kind)
+                );
                 return std::nullopt;
             }
         }
 
-        profiling_common::EntrySite<L2PerfModule> site;
+        profiling_common::EntrySite<L2SwimlaneModule> site;
         site.kind = static_cast<int>(kind);
         site.info.index = entry.core_index;
         site.info.slot_idx = 0;
@@ -131,22 +134,23 @@ struct L2PerfModule {
         site.info.host_buffer_ptr = nullptr;  // filled by ProfilerAlgorithms
         site.info.buffer_seq = entry.buffer_seq;
 
-        if (kind == 0) {
-            L2PerfBufferState *state = get_perf_buffer_state(shm, static_cast<int>(entry.core_index));
+        if (kind == L2SwimlaneBufferKind::AicpuTask) {
+            L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm, static_cast<int>(entry.core_index));
             site.free_queue = &state->free_queue;
-            site.buffer_size = sizeof(L2PerfBuffer);
-            site.info.type = ProfBufferType::PERF_RECORD;
-        } else if (kind == 1) {
-            PhaseBufferState *state = get_phase_buffer_state(shm, num_cores, static_cast<int>(entry.core_index));
+            site.buffer_size = sizeof(L2SwimlaneAicpuTaskBuffer);
+            site.info.type = ProfBufferType::AICPU_TASK;
+        } else if (kind == L2SwimlaneBufferKind::AicpuPhase) {
+            L2SwimlaneAicpuPhasePool *state =
+                get_phase_buffer_state(shm, num_cores, static_cast<int>(entry.core_index));
             site.free_queue = &state->free_queue;
-            site.buffer_size = sizeof(PhaseBuffer);
-            site.info.type = ProfBufferType::PHASE;
-        } else {  // kind == 2 (AICORE)
-            L2PerfAicoreBufferState *ac_state =
+            site.buffer_size = sizeof(L2SwimlaneAicpuPhaseBuffer);
+            site.info.type = ProfBufferType::AICPU_PHASE;
+        } else {  // L2SwimlaneBufferKind::AicoreTask
+            L2SwimlaneAicoreTaskPool *ac_state =
                 get_aicore_buffer_state(shm, num_cores, static_cast<int>(entry.core_index));
             site.free_queue = &ac_state->free_queue;
-            site.buffer_size = sizeof(L2PerfAicoreBuffer);
-            site.info.type = ProfBufferType::AICORE;
+            site.buffer_size = sizeof(L2SwimlaneAicoreTaskBuffer);
+            site.info.type = ProfBufferType::AICORE_TASK;
         }
         return site;
     }
@@ -157,23 +161,24 @@ struct L2PerfModule {
 
         // Per-core perf states (kind 0)
         for (int i = 0; i < num_cores; i++) {
-            L2PerfBufferState *state = get_perf_buffer_state(shm, i);
-            cb(/*kind=*/0, &state->free_queue, sizeof(L2PerfBuffer));
+            L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm, i);
+            cb(/*kind=*/0, &state->free_queue, sizeof(L2SwimlaneAicpuTaskBuffer));
         }
 
         // Per-core AICore states (kind 2)
         for (int i = 0; i < num_cores; i++) {
-            L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(shm, num_cores, i);
-            cb(/*kind=*/2, &ac_state->free_queue, sizeof(L2PerfAicoreBuffer));
+            L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(shm, num_cores, i);
+            cb(/*kind=*/2, &ac_state->free_queue, sizeof(L2SwimlaneAicoreTaskBuffer));
         }
 
-        // Per-thread phase states (kind 1) — gated on AicpuPhaseHeader being
+        // Per-thread phase states (kind 1) — gated on L2SwimlaneAicpuPhaseHeader being
         // initialized (runtimes that don't emit phase records leave it zero).
-        AicpuPhaseHeader *ph = get_phase_header(shm, num_cores);
-        const int num_phase_threads = (ph->magic == AICPU_PHASE_MAGIC) ? static_cast<int>(ph->num_sched_threads) : 0;
+        L2SwimlaneAicpuPhaseHeader *ph = get_phase_header(shm, num_cores);
+        const int num_phase_threads =
+            (ph->magic == L2_SWIMLANE_AICPU_PHASE_MAGIC) ? static_cast<int>(ph->num_sched_threads) : 0;
         for (int t = 0; t < num_phase_threads; t++) {
-            PhaseBufferState *state = get_phase_buffer_state(shm, num_cores, t);
-            cb(/*kind=*/1, &state->free_queue, sizeof(PhaseBuffer));
+            L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm, num_cores, t);
+            cb(/*kind=*/1, &state->free_queue, sizeof(L2SwimlaneAicpuPhaseBuffer));
         }
     }
 };
@@ -182,13 +187,13 @@ struct L2PerfModule {
 // alloc / free are std::function so callers bind their MemoryAllocator via
 // lambda capture; register / unregister stay as plain function pointers
 // because they wrap stateless HAL globals (halHost*).
-using L2PerfAllocCallback = profiling_common::ProfAllocCallback;
-using L2PerfRegisterCallback = profiling_common::ProfRegisterCallback;
-using L2PerfUnregisterCallback = profiling_common::ProfUnregisterCallback;
-using L2PerfFreeCallback = profiling_common::ProfFreeCallback;
+using L2SwimlaneAllocCallback = profiling_common::ProfAllocCallback;
+using L2SwimlaneRegisterCallback = profiling_common::ProfRegisterCallback;
+using L2SwimlaneUnregisterCallback = profiling_common::ProfUnregisterCallback;
+using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback;
 
 // =============================================================================
-// L2PerfCollector
+// L2SwimlaneCollector
 // =============================================================================
 
 /**
@@ -206,7 +211,7 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback;
  *                                    (mgmt first so its final-drain entries
  *                                    have a consumer).
  *   5. read_phase_header_metadata() — single-shot read of the core→thread
- *                                    mapping from AicpuPhaseHeader.
+ *                                    mapping from L2SwimlaneAicpuPhaseHeader.
  *   6. reconcile_counters()        — device-side three-bucket accounting for
  *                                    both PERF and PHASE pools (total /
  *                                    collected / dropped).
@@ -216,33 +221,33 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback;
  * device flush is the only data path. Any non-zero `current_buf_ptr` after
  * stop() is logged as a bug.
  */
-class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L2PerfModule> {
+class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule> {
 public:
-    L2PerfCollector() = default;
-    ~L2PerfCollector();
+    L2SwimlaneCollector() = default;
+    ~L2SwimlaneCollector();
 
-    L2PerfCollector(const L2PerfCollector &) = delete;
-    L2PerfCollector &operator=(const L2PerfCollector &) = delete;
+    L2SwimlaneCollector(const L2SwimlaneCollector &) = delete;
+    L2SwimlaneCollector &operator=(const L2SwimlaneCollector &) = delete;
 
     // ProfilerBase contract
     static constexpr int kIdleTimeoutSec = PLATFORM_PROF_TIMEOUT_SECONDS;
-    static constexpr const char *kSubsystemName = "L2Perf";
+    static constexpr const char *kSubsystemName = "L2Swimlane";
 
     /**
      * Initialize performance profiling.
      *
      * Allocates the shared-memory region (header + per-core / per-thread
-     * BufferStates), pre-allocates initial L2PerfBuffers and PhaseBuffers,
+     * pools), pre-allocates initial L2SwimlaneAicpuTaskBuffers and L2SwimlaneAicpuPhaseBuffers,
      * and seeds the per-pool free_queues + the framework's recycled pools.
      *
      * @param num_aicore               Number of AICore instances
      * @param device_id                Device ID (forwarded to register_cb)
-     * @param l2_perf_level   Collection granularity (DISABLED / AICORE_TIMING
+     * @param l2_swimlane_level   Collection granularity (DISABLED / AICORE_TIMING
      *                                 / AICPU_TIMING / SCHED_PHASES / ORCH_PHASES).
      *                                 Written into
-     *                                 `L2PerfDataHeader::l2_perf_level`
+     *                                 `L2SwimlaneDataHeader::l2_swimlane_level`
      *                                 so AICPU can promote it in
-     *                                 `l2_perf_aicpu_init`, AND cached on the
+     *                                 `l2_swimlane_aicpu_init`, AND cached on the
      *                                 collector so `export_swimlane_json()`
      *                                 can gate phase sections and stamp the
      *                                 JSON `version`.
@@ -251,28 +256,28 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
      *                                 simulation)
      * @param free_cb                  Device memory free callback
      * @param user_data                Opaque pointer forwarded to callbacks
-     * @param output_prefix            Per-task directory; l2_perf_records.json
+     * @param output_prefix            Per-task directory; l2_swimlane_records.json
      *                                 lands here. Required (non-empty);
      *                                 CallConfig::validate() enforces this
      *                                 upstream.
      * @return 0 on success, error code on failure
      */
     int initialize(
-        int num_aicore, int device_id, L2PerfLevel l2_perf_level, const L2PerfAllocCallback &alloc_cb,
-        L2PerfRegisterCallback register_cb, const L2PerfFreeCallback &free_cb, const std::string &output_prefix
+        int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb,
+        L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
     );
 
     /**
      * Per-buffer callback invoked by ProfilerBase's poll loop. Dispatches on
-     * info.type to copy either an L2PerfBuffer (PERF_RECORD) into the per-core
-     * record vector, or a PhaseBuffer (PHASE) into the per-thread phase-record
+     * info.type to copy either an L2SwimlaneAicpuTaskBuffer (AICPU_TASK) into the per-core
+     * record vector, or an L2SwimlaneAicpuPhaseBuffer (AICPU_PHASE) into the per-thread phase-record
      * vector.
      */
     void on_buffer_collected(const ReadyBufferInfo &info);
 
     /**
      * Export collected records as a Chrome Trace Event JSON (swimlane view).
-     * Writes <output_prefix>/l2_perf_records.json — directory is captured at
+     * Writes <output_prefix>/l2_swimlane_records.json — directory is captured at
      * initialize() time.
      *
      * @return 0 on success, error code on failure
@@ -288,7 +293,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
      * @param user_data      Opaque pointer forwarded to callbacks
      * @return 0 on success, error code on failure
      */
-    int finalize(L2PerfUnregisterCallback unregister_cb, const L2PerfFreeCallback &free_cb);
+    int finalize(L2SwimlaneUnregisterCallback unregister_cb, const L2SwimlaneFreeCallback &free_cb);
 
     /**
      * @return true if initialize() succeeded and finalize() has not run.
@@ -296,24 +301,25 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     bool is_initialized() const { return shm_host_ != nullptr; }
 
     /**
-     * Device pointer to the L2PerfDataHeader. Set kernel_args.l2_perf_data_base
+     * Device pointer to the L2SwimlaneDataHeader. Set kernel_args.l2_swimlane_data_base
      * to this after initialize() succeeds so the AICPU side can find the
      * shared memory.
      */
-    void *get_l2_perf_setup_device_ptr() const { return perf_shared_mem_dev_; }
+    void *get_l2_swimlane_setup_device_ptr() const { return perf_shared_mem_dev_; }
 
     /**
-     * Device pointer to a uint64_t[num_aicore] table where each entry is
-     * `L2PerfBufferState[i].aicore_ring_ptr`. Allocated and populated by
-     * initialize(); freed by finalize(). Set kernel_args.aicore_ring_addr
-     * to this so the AICore kernel entry can index by block_idx and feed
-     * the per-core ring into the platform's set_l2_perf_aicore_ring().
-     * Returns nullptr before initialize() succeeds.
+     * Device pointer to a uint64_t[num_aicore] table where each entry will
+     * hold this core's `&L2SwimlaneAicoreTaskPool::rotation` device address. Host
+     * only allocates the bytes here; AICPU populates the entries inside
+     * `l2_swimlane_aicpu_init`. Freed by finalize(). Set kernel_args.l2_swimlane_aicore_rotation_table
+     * to this so the AICore kernel entry can index by block_idx and feed the
+     * per-core rotation channel into `set_l2_swimlane_aicore_rotation_slot()`. Returns
+     * nullptr before initialize() succeeds.
      */
     void *get_aicore_ring_addr_table_device_ptr() const { return aicore_ring_addr_table_dev_; }
 
     /**
-     * Read AICPU phase metadata that lives in AicpuPhaseHeader (not on the
+     * Read AICPU phase metadata that lives in L2SwimlaneAicpuPhaseHeader (not on the
      * buffer pipeline): the core→thread mapping plus a has-data signal
      * derived from accumulated per-event records. Single-shot — must be
      * called after stop() so the shm region has settled.
@@ -333,38 +339,38 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     void reconcile_counters();
 
     /**
-     * @return Per-core L2PerfRecord vectors (indexed by core_index). For tests.
+     * @return Per-core L2SwimlaneAicpuTaskRecord vectors (indexed by core_index). For tests.
      */
-    const std::vector<std::vector<L2PerfRecord>> &get_records() const { return collected_perf_records_; }
+    const std::vector<std::vector<L2SwimlaneAicpuTaskRecord>> &get_records() const { return collected_perf_records_; }
 
 private:
     // Shared memory pointers. shm_host_ / device_id_ live on ProfilerBase
     // (set via set_memory_context in initialize()).
     void *perf_shared_mem_dev_{nullptr};
 
-    // Standalone uint64_t[num_aicore] table holding per-core L2PerfAicoreBuffer
+    // Standalone uint64_t[num_aicore] table holding per-core L2SwimlaneAicoreRotation
     // addresses. Allocated in initialize(), freed in finalize(). AICore reads
-    // ring_table[block_idx] via KernelArgs::aicore_ring_addr.
+    // ring_table[block_idx] via KernelArgs::l2_swimlane_aicore_rotation_table.
     void *aicore_ring_addr_table_dev_{nullptr};
 
     int num_aicore_{0};
-    L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
 
     // Per-task output directory captured at initialize() time. Consumed by
-    // export_swimlane_json() to build <prefix>/l2_perf_records.json.
+    // export_swimlane_json() to build <prefix>/l2_swimlane_records.json.
     std::string output_prefix_;
 
     // Collected data (per-core vectors, indexed by core_index)
-    std::vector<std::vector<L2PerfRecord>> collected_perf_records_;
+    std::vector<std::vector<L2SwimlaneAicpuTaskRecord>> collected_perf_records_;
 
     // Collected AICore records (per-core vectors). Each entry is a full
-    // L2PerfAicoreRecord captured from a rotated L2PerfAicoreBuffer. The
+    // L2SwimlaneAicoreTaskRecord captured from a rotated L2SwimlaneAicoreTaskBuffer. The
     // order across rotations is preserved by `copy_aicore_buffer` (we sort
     // incoming buffers by buffer_seq before flattening).
-    std::vector<std::vector<L2PerfAicoreRecord>> collected_aicore_records_;
+    std::vector<std::vector<L2SwimlaneAicoreTaskRecord>> collected_aicore_records_;
 
     // AICPU phase profiling data (per-thread, mixed sched + orch records)
-    std::vector<std::vector<AicpuPhaseRecord>> collected_phase_records_;
+    std::vector<std::vector<L2SwimlaneAicpuPhaseRecord>> collected_phase_records_;
     bool has_phase_data_{false};
 
     // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned)
@@ -374,7 +380,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     uint64_t total_perf_collected_{0};
     uint64_t total_phase_collected_{0};
 
-    // Allocate a single buffer (L2PerfBuffer or PhaseBuffer) and register it.
+    // Allocate a single buffer (L2SwimlaneAicpuTaskBuffer or L2SwimlaneAicpuPhaseBuffer) and register it.
     // The RAII counterpart ``release_one_buffer`` lives on ProfilerBase and
     // is shared with every other collector.
     void *alloc_single_buffer(size_t size, void **host_ptr_out);
@@ -385,13 +391,14 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     void copy_aicore_buffer(const ReadyBufferInfo &info);
 
     // AICore-as-producer: AICore writes start/end/task_id directly into a
-    // per-core L2PerfAicoreBuffer (allocated by initialize(), addressed via
-    // state->aicore_ring_ptr). AICPU never reads it on the hot path.
+    // per-core L2SwimlaneAicoreTaskBuffer (allocated by initialize(), addressed via
+    // L2SwimlaneAicoreTaskPool::rotation, which AICPU rotates per BUFFER_SIZE
+    // completion). AICPU never reads the AICore records on the hot path.
     // join_aicore_records() runs after stop(): it walks each core's buffer,
     // builds a `task_id_low32 → (start, end)` map, then patches the matching
-    // L2PerfRecord entries in collected_perf_records_. Called from
+    // L2SwimlaneAicpuTaskRecord entries in collected_perf_records_. Called from
     // export_swimlane_json() so external callers see a transparent stream.
     void join_aicore_records();
 };
 
-#endif  // SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_
+#endif  // SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
diff --git a/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h b/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h
index 670803f9f..54673c07a 100644
--- a/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h
+++ b/src/a2a3/platform/include/host/profiling_common/buffer_pool_manager.h
@@ -11,7 +11,7 @@
 
 /**
  * @file buffer_pool_manager.h
- * @brief Generic buffer-pool data structure shared by L2Perf, TensorDump,
+ * @brief Generic buffer-pool data structure shared by L2Swimlane, TensorDump,
  *        and PMU collectors. Owns:
  *
  *   - ready_queue (mgmt → collector) with mutex/cv,
@@ -333,7 +333,7 @@ class BufferPoolManager {
     // dev → host mapping (single source of truth for resolve_host_ptr)
     std::unordered_map<void *, void *> dev_to_host_;
 
-    // Per-kind recycled buffer pools (vector indexed by Module's BufferKind id)
+    // Per-kind recycled buffer pools (vector indexed by Module-defined kind id)
     std::vector<std::vector<void *>> recycled_;
 };
 
diff --git a/src/a2a3/platform/include/host/profiling_common/profiler_base.h b/src/a2a3/platform/include/host/profiling_common/profiler_base.h
index fbf4cebe2..e9b06f7d1 100644
--- a/src/a2a3/platform/include/host/profiling_common/profiler_base.h
+++ b/src/a2a3/platform/include/host/profiling_common/profiler_base.h
@@ -11,7 +11,7 @@
 
 /**
  * @file profiler_base.h
- * @brief CRTP scaffolding shared by L2Perf / Dump / PMU collectors.
+ * @brief CRTP scaffolding shared by L2Swimlane / Dump / PMU collectors.
  *
  * Owns the BufferPoolManager<Module>, the mgmt thread (which polls AICPU
  * ready queues and recycles buffers), and the collector poll thread.
@@ -19,12 +19,12 @@
  * Module concept contract
  * -----------------------
  *
- * Each profiling subsystem provides a `Module` struct (e.g., L2PerfModule,
+ * Each profiling subsystem provides a `Module` struct (e.g., L2SwimlaneModule,
  * DumpModule, PmuModule) that supplies the data-layout traits the unified
  * mgmt-loop algorithms (ProfilerAlgorithms<Module>) need. Required members:
  *
  *   // Types
- *   using DataHeader      = ...;   // Shared-memory header (e.g. L2PerfDataHeader).
+ *   using DataHeader      = ...;   // Shared-memory header (e.g. L2SwimlaneDataHeader).
  *   using ReadyEntry      = ...;   // Per-AICPU-thread ready-queue entry.
  *   using ReadyBufferInfo = ...;   // Hand-off struct to the collector thread
  *                                  // (carries dev/host ptrs, optional kind
@@ -34,10 +34,10 @@
  *                                  // `buffer_ptrs[kSlotCount]`.
  *
  *   // Constants
- *   static constexpr int      kBufferKinds;    // L2Perf=2 (perf+phase), Dump=1, PMU=1.
+ *   static constexpr int      kBufferKinds;    // L2Swimlane=3 (task+phase+aicore), Dump=1, PMU=1.
  *   static constexpr uint32_t kReadyQueueSize; // Per-thread ready-queue depth.
  *   static constexpr uint32_t kSlotCount;      // FreeQueue::buffer_ptrs[] length.
- *   static constexpr const char* kSubsystemName; // "PMU" / "L2Perf" / "Dump".
+ *   static constexpr const char* kSubsystemName; // "PMU" / "L2Swimlane" / "Dump".
  *
  *   // Header pointer cast (host_ptr → DataHeader*)
  *   static DataHeader* header_from_shm(void* shared_mem_host);
@@ -117,7 +117,7 @@
  *       (use the subsystem's PLATFORM_*_TIMEOUT_SECONDS).
  *
  *   static constexpr const char*  kSubsystemName;
- *       Used in the idle-timeout log line (e.g. "L2Perf", "PMU", "TensorDump").
+ *       Used in the idle-timeout log line (e.g. "L2Swimlane", "PMU", "TensorDump").
  */
 
 #ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_PROFILING_COMMON_PROFILER_BASE_H_
@@ -138,7 +138,7 @@
 namespace profiling_common {
 
 // Common subsystem callback signatures. All four collectors (PMU / TensorDump
-// / L2Perf / DepGen) used to declare their own typedefs with identical
+// / L2Swimlane / DepGen) used to declare their own typedefs with identical
 // shapes; these are the canonical types stashed in ProfilerBase via
 // set_memory_context().
 //
diff --git a/src/a2a3/platform/onboard/aicore/kernel.cpp b/src/a2a3/platform/onboard/aicore/kernel.cpp
index e87cb81ef..5f504f54d 100644
--- a/src/a2a3/platform/onboard/aicore/kernel.cpp
+++ b/src/a2a3/platform/onboard/aicore/kernel.cpp
@@ -15,7 +15,7 @@
 #include "aicore/aicore_profiling_state.h"
 #include "common/core_type.h"
 #include "common/kernel_args.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 
 #ifdef __DAV_VEC__
 #define KERNEL_ENTRY(x) \
@@ -45,25 +45,26 @@
 [[block_local]] static uint32_t s_aicore_profiling_flag;
 // Slot pointer (NOT the dereferenced rotation address) — see
 // aicore_profiling_state.h for the lazy-deref contract.
-[[block_local]] static __gm__ uint64_t *s_aicore_rotation_slot;
-[[block_local]] static __gm__ AicoreRotation *s_aicore_rotation;
+[[block_local]] static __gm__ uint64_t *s_l2_swimlane_aicore_rotation_slot;
+[[block_local]] static __gm__ L2SwimlaneAicoreRotation *s_l2_swimlane_aicore_rotation;
 
 __attribute__((weak)) __aicore__ void set_aicore_profiling_flag(uint32_t flag) { s_aicore_profiling_flag = flag; }
 __attribute__((weak)) __aicore__ uint32_t get_aicore_profiling_flag() { return s_aicore_profiling_flag; }
 
-__attribute__((weak)) __aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) {
-    s_aicore_rotation_slot = slot_ptr;
-    s_aicore_rotation = nullptr;  // force lazy resolution on next get
+__attribute__((weak)) __aicore__ void set_l2_swimlane_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) {
+    s_l2_swimlane_aicore_rotation_slot = slot_ptr;
+    s_l2_swimlane_aicore_rotation = nullptr;  // force lazy resolution on next get
 }
-__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() {
-    // Lazy first-call resolve: AICPU init populates `*s_aicore_rotation_slot`
+__attribute__((weak)) __aicore__ __gm__ L2SwimlaneAicoreRotation *get_l2_swimlane_aicore_rotation() {
+    // Lazy first-call resolve: AICPU init populates `*s_l2_swimlane_aicore_rotation_slot`
     // before dispatching the first task, so by the time the executor reaches
     // for the rotation (inside the first-task branch of the dispatch poll)
     // the slot holds a valid device address.
-    if (s_aicore_rotation == nullptr && s_aicore_rotation_slot != nullptr) {
-        s_aicore_rotation = reinterpret_cast<__gm__ AicoreRotation *>(*s_aicore_rotation_slot);
+    if (s_l2_swimlane_aicore_rotation == nullptr && s_l2_swimlane_aicore_rotation_slot != nullptr) {
+        s_l2_swimlane_aicore_rotation =
+            reinterpret_cast<__gm__ L2SwimlaneAicoreRotation *>(*s_l2_swimlane_aicore_rotation_slot);
     }
-    return s_aicore_rotation;
+    return s_l2_swimlane_aicore_rotation;
 }
 
 extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type);
@@ -101,17 +102,18 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA
 
     // Publish per-core profiling state into platform-owned slots before the
     // executor runs. AICore reads via get_aicore_profiling_flag() /
-    // get_aicore_rotation() — never touches Handshake for profiling.
+    // get_l2_swimlane_aicore_rotation() — never touches Handshake for profiling.
     set_aicore_profiling_flag(k_args->enable_profiling_flag);
     if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) {
         // Stash only the slot pointer. The slot CONTENTS are written by
-        // AICPU's `l2_perf_aicpu_init` which runs concurrently with this
+        // AICPU's `l2_swimlane_aicpu_init` which runs concurrently with this
         // entry; dereferencing here would race with AICPU's write. The
-        // executor defers the deref via `get_aicore_rotation()` until inside
+        // executor defers the deref via `get_l2_swimlane_aicore_rotation()` until inside
         // the first-task branch — by then AICPU has dispatched, so init is
         // done and the slot is populated.
-        __gm__ uint64_t *rotation_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_ring_addr);
-        set_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr);
+        __gm__ uint64_t *rotation_table =
+            reinterpret_cast<__gm__ uint64_t *>(k_args->l2_swimlane_aicore_rotation_table);
+        set_l2_swimlane_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr);
     }
 
     aicore_execute(k_args->runtime_args, block_idx, core_type);
diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp
index 7926fa0e7..2a5d99053 100644
--- a/src/a2a3/platform/onboard/aicpu/kernel.cpp
+++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp
@@ -16,7 +16,7 @@
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "aicpu/device_log.h"
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/platform_aicpu_affinity.h"
@@ -109,8 +109,8 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a
     set_orch_device_id(static_cast<int>(k_args->device_id));
     set_platform_dump_base(k_args->dump_data_base);
     set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR));
-    set_platform_l2_perf_base(k_args->l2_perf_data_base);
-    set_platform_aicore_rotation_table(k_args->aicore_ring_addr);
+    set_platform_l2_swimlane_base(k_args->l2_swimlane_data_base);
+    set_platform_l2_swimlane_aicore_rotation_table(k_args->l2_swimlane_aicore_rotation_table);
     set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE));
     set_platform_pmu_base(k_args->pmu_data_base);
     set_platform_pmu_reg_addrs(k_args->pmu_reg_addrs);
diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt
index 65b238589..78d8f4097 100644
--- a/src/a2a3/platform/onboard/host/CMakeLists.txt
+++ b/src/a2a3/platform/onboard/host/CMakeLists.txt
@@ -56,7 +56,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp"
-    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/comm_hccl.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp"
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index dfa0b9d63..9a01c133a 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -253,9 +253,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     // Initialize per-subsystem shared memory.
     if (enable_l2_swimlane_) {
-        rc = init_l2_perf(num_aicore, device_id_);
+        rc = init_l2_swimlane(num_aicore, device_id_);
         if (rc != 0) {
-            LOG_ERROR("init_l2_perf failed: %d", rc);
+            LOG_ERROR("init_l2_swimlane failed: %d", rc);
             return rc;
         }
     }
@@ -296,7 +296,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     // On any exit from run() — success or early error — release the diagnostics
     // collectors' shared memory. They are only re-initialized per run(), so a
     // Worker reused across runs (e.g. a pytest session-scoped worker pool) would
-    // otherwise re-enter init_l2_perf() with stale state still allocated.
+    // otherwise re-enter init_l2_swimlane() with stale state still allocated.
     auto perf_cleanup = RAIIScopeGuard([this]() {
         finalize_collectors();
     });
@@ -457,7 +457,7 @@ int DeviceRunner::finalize() {
 
 // `launch_aicpu_kernel` and `launch_aicore_kernel` live on `DeviceRunnerBase`.
 
-int DeviceRunner::init_l2_perf(int num_aicore, int device_id) {
+int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
     auto alloc_cb = [this](size_t size) -> void * {
         return mem_alloc_.alloc(size);
     };
@@ -479,16 +479,17 @@ int DeviceRunner::init_l2_perf(int num_aicore, int device_id) {
         return mem_alloc_.free(dev_ptr);
     };
 
-    int rc = l2_perf_collector_.initialize(
-        num_aicore, device_id, l2_perf_level_, alloc_cb, register_cb, free_cb, output_prefix_
+    int rc = l2_swimlane_collector_.initialize(
+        num_aicore, device_id, l2_swimlane_level_, alloc_cb, register_cb, free_cb, output_prefix_
     );
     if (rc != 0) {
         return rc;
     }
 
-    kernel_args_.args.l2_perf_data_base = reinterpret_cast<uint64_t>(l2_perf_collector_.get_l2_perf_setup_device_ptr());
-    kernel_args_.args.aicore_ring_addr =
-        reinterpret_cast<uint64_t>(l2_perf_collector_.get_aicore_ring_addr_table_device_ptr());
+    kernel_args_.args.l2_swimlane_data_base =
+        reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr());
+    kernel_args_.args.l2_swimlane_aicore_rotation_table =
+        reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_aicore_ring_addr_table_device_ptr());
     return 0;
 }
 
@@ -634,8 +635,8 @@ void DeviceRunner::finalize_collectors() {
         return mem_alloc_.free(dev_ptr);
     };
 
-    if (l2_perf_collector_.is_initialized()) {
-        l2_perf_collector_.finalize(unregister_cb, free_cb);
+    if (l2_swimlane_collector_.is_initialized()) {
+        l2_swimlane_collector_.finalize(unregister_cb, free_cb);
     }
     if (dump_collector_.is_initialized()) {
         dump_collector_.finalize(unregister_cb, free_cb);
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index ea2358a72..9a5b65b3f 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -40,7 +40,7 @@
 #include "prepare_callable_common.h"
 #include "common/kernel_args.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "device_arena.h"
@@ -48,7 +48,7 @@
 #include "device_runner_helpers.h"  // common DeviceArgs + KernelArgsHelper
 #include "host/function_cache.h"
 #include "host/memory_allocator.h"
-#include "host/l2_perf_collector.h"
+#include "host/l2_swimlane_collector.h"
 #include "host/tensor_dump_collector.h"
 #include "host/pmu_collector.h"
 #include "host/dep_gen_collector.h"
@@ -195,7 +195,7 @@ class DeviceRunner : public DeviceRunnerBase {
     // acl_ready_, so runtimes that never ask for ACL (e.g. pure rt-layer) stay unaffected.
     bool acl_ready_{false};
 
-    // Shared collectors (`l2_perf_collector_`, `dump_collector_`,
+    // Shared collectors (`l2_swimlane_collector_`, `dump_collector_`,
     // `pmu_collector_`, `scope_stats_collector_`) live on `DeviceRunnerBase`.
     //
     // dep_gen collector — captures orchestrator submit_task inputs for
@@ -217,7 +217,7 @@ class DeviceRunner : public DeviceRunnerBase {
      * @param device_id Device ID for host registration
      * @return 0 on success, error code on failure
      */
-    int init_l2_perf(int num_aicore, int device_id);
+    int init_l2_swimlane(int num_aicore, int device_id);
 
     /**
      * Initialize tensor dump shared memory and collector.
@@ -274,7 +274,7 @@ class DeviceRunner : public DeviceRunnerBase {
      */
     void finalize_collectors();
     // Shared enable flags (`enable_l2_swimlane_`, `enable_dump_tensor_`,
-    // `enable_pmu_`, `enable_scope_stats_`, `l2_perf_level_`,
+    // `enable_pmu_`, `enable_scope_stats_`, `l2_swimlane_level_`,
     // `pmu_event_type_`, `output_prefix_`) live on `DeviceRunnerBase`.
     //
     // dep_gen enablement is a2a3-only.
diff --git a/src/a2a3/platform/sim/aicore/inner_kernel.h b/src/a2a3/platform/sim/aicore/inner_kernel.h
index 4f41e10a6..b92e2e279 100644
--- a/src/a2a3/platform/sim/aicore/inner_kernel.h
+++ b/src/a2a3/platform/sim/aicore/inner_kernel.h
@@ -38,12 +38,12 @@
 //   - with CACHELINE_OUT: write-back/flush (write to memory) -> release semantics
 // On aarch64, acquire-only fences do NOT prevent store-store reordering across the
 // barrier, so using acquire for the flush direction causes a race: the AICPU can
-// observe the COND register FIN signal before l2_perf_buf->count is visible.
+// observe the COND register FIN signal before l2_swimlane_buf->count is visible.
 // Using seq_cst (dmb ish / full barrier) covers both directions safely.
 // Use variadic macro to support both 2-arg and 3-arg calls.
 #define dcci(...) std::atomic_thread_fence(std::memory_order_seq_cst)
 
-// dsb / mem_dsb_t — CANN provides these on real AICore; l2_perf_collector uses them after dcci flush.
+// dsb / mem_dsb_t — CANN provides these on real AICore; l2_swimlane_collector uses them after dcci flush.
 // Simulation: full fence (same strength as dcci above) so AICPU ordering matches hardware intent.
 typedef int mem_dsb_t;
 #define dsb(_kind)                                           \
diff --git a/src/a2a3/platform/sim/aicore/kernel.cpp b/src/a2a3/platform/sim/aicore/kernel.cpp
index 033682d2b..4607f1526 100644
--- a/src/a2a3/platform/sim/aicore/kernel.cpp
+++ b/src/a2a3/platform/sim/aicore/kernel.cpp
@@ -23,7 +23,7 @@
 #include "aicore/aicore.h"
 #include "aicore/aicore_profiling_state.h"
 #include "common/core_type.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "runtime.h"
 
@@ -35,16 +35,16 @@ static pthread_key_t g_core_id_key;
 static pthread_key_t g_aicore_profiling_flag_key;
 // Slot pointer (NOT the dereferenced rotation address) — see
 // aicore_profiling_state.h for the lazy-deref contract.
-static pthread_key_t g_aicore_rotation_slot_key;
-static pthread_key_t g_aicore_rotation_key;
+static pthread_key_t g_l2_swimlane_aicore_rotation_slot_key;
+static pthread_key_t g_l2_swimlane_aicore_rotation_key;
 static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT;
 
 static void create_tls_keys() {
     pthread_key_create(&g_reg_base_key, nullptr);
     pthread_key_create(&g_core_id_key, nullptr);
     pthread_key_create(&g_aicore_profiling_flag_key, nullptr);
-    pthread_key_create(&g_aicore_rotation_slot_key, nullptr);
-    pthread_key_create(&g_aicore_rotation_key, nullptr);
+    pthread_key_create(&g_l2_swimlane_aicore_rotation_slot_key, nullptr);
+    pthread_key_create(&g_l2_swimlane_aicore_rotation_key, nullptr);
 }
 
 volatile uint8_t *sim_get_reg_base() { return static_cast<volatile uint8_t *>(pthread_getspecific(g_reg_base_key)); }
@@ -65,18 +65,19 @@ __aicore__ uint32_t get_aicore_profiling_flag() {
     return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(pthread_getspecific(g_aicore_profiling_flag_key)));
 }
 
-__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) {
-    pthread_setspecific(g_aicore_rotation_slot_key, reinterpret_cast<void *>(slot_ptr));
-    pthread_setspecific(g_aicore_rotation_key, nullptr);  // force lazy resolve on next get
+__aicore__ void set_l2_swimlane_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) {
+    pthread_setspecific(g_l2_swimlane_aicore_rotation_slot_key, reinterpret_cast<void *>(slot_ptr));
+    pthread_setspecific(g_l2_swimlane_aicore_rotation_key, nullptr);  // force lazy resolve on next get
 }
-__aicore__ __gm__ AicoreRotation *get_aicore_rotation() {
-    auto *cached = reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key));
+__aicore__ __gm__ L2SwimlaneAicoreRotation *get_l2_swimlane_aicore_rotation() {
+    auto *cached =
+        reinterpret_cast<__gm__ L2SwimlaneAicoreRotation *>(pthread_getspecific(g_l2_swimlane_aicore_rotation_key));
     if (cached != nullptr) return cached;
-    auto *slot = reinterpret_cast<__gm__ uint64_t *>(pthread_getspecific(g_aicore_rotation_slot_key));
+    auto *slot = reinterpret_cast<__gm__ uint64_t *>(pthread_getspecific(g_l2_swimlane_aicore_rotation_slot_key));
     if (slot == nullptr) return nullptr;
     // Lazy first-call resolve — see aicore_profiling_state.h.
-    cached = reinterpret_cast<__gm__ AicoreRotation *>(*slot);
-    pthread_setspecific(g_aicore_rotation_key, reinterpret_cast<void *>(cached));
+    cached = reinterpret_cast<__gm__ L2SwimlaneAicoreRotation *>(*slot);
+    pthread_setspecific(g_l2_swimlane_aicore_rotation_key, reinterpret_cast<void *>(cached));
     return cached;
 }
 
@@ -102,7 +103,7 @@ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type);
 // executor with its original signature.
 extern "C" void aicore_execute_wrapper(
     __gm__ Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs,
-    uint32_t enable_profiling_flag, uint64_t aicore_ring_addr
+    uint32_t enable_profiling_flag, uint64_t l2_swimlane_aicore_rotation_table
 ) {
     pthread_once(&g_tls_once, create_tls_keys);
 
@@ -118,14 +119,14 @@ extern "C" void aicore_execute_wrapper(
 
     // Publish per-core profiling state before the executor runs.
     set_aicore_profiling_flag(enable_profiling_flag);
-    if (aicore_ring_addr != 0) {
+    if (l2_swimlane_aicore_rotation_table != 0) {
         // Stash only the slot pointer; deref happens lazily inside
-        // get_aicore_rotation() once AICPU has populated the table. See
+        // get_l2_swimlane_aicore_rotation() once AICPU has populated the table. See
         // aicore_profiling_state.h.
-        uint64_t *rotation_table = reinterpret_cast<uint64_t *>(aicore_ring_addr);
-        set_aicore_rotation_slot(reinterpret_cast<__gm__ uint64_t *>(&rotation_table[block_idx]));
+        uint64_t *rotation_table = reinterpret_cast<uint64_t *>(l2_swimlane_aicore_rotation_table);
+        set_l2_swimlane_aicore_rotation_slot(reinterpret_cast<__gm__ uint64_t *>(&rotation_table[block_idx]));
     } else {
-        set_aicore_rotation_slot(nullptr);
+        set_l2_swimlane_aicore_rotation_slot(nullptr);
     }
 
     // Set core identity for pto-isa TPUSH/TPOP simulation.
diff --git a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt
index e32931c81..f55a68ce4 100644
--- a/src/a2a3/platform/sim/host/CMakeLists.txt
+++ b/src/a2a3/platform/sim/host/CMakeLists.txt
@@ -43,7 +43,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
-    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/dep_gen_collector.cpp"
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index a6a78bdc2..f350cec22 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -63,7 +63,7 @@ extern "C" __attribute__((weak, visibility("hidden"))) int dep_gen_replay_emit_d
 typedef int (*aicpu_execute_func_t)(Runtime *runtime);
 typedef void (*aicore_execute_func_t)(
     Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs,
-    uint32_t enable_profiling_flag, uint64_t aicore_ring_addr
+    uint32_t enable_profiling_flag, uint64_t l2_swimlane_aicore_rotation_table
 );
 typedef void (*set_platform_regs_func_t)(uint64_t regs);
 typedef void (*set_platform_dump_base_func_t)(uint64_t dump_data_base);
@@ -266,17 +266,18 @@ int DeviceRunner::ensure_binaries_loaded() {
             return -1;
         }
 
-        set_platform_l2_perf_base_func_ =
-            reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_l2_perf_base"));
-        if (set_platform_l2_perf_base_func_ == nullptr) {
-            LOG_ERROR("dlsym failed for set_platform_l2_perf_base: %s", dlerror());
+        set_platform_l2_swimlane_base_func_ =
+            reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_l2_swimlane_base"));
+        if (set_platform_l2_swimlane_base_func_ == nullptr) {
+            LOG_ERROR("dlsym failed for set_platform_l2_swimlane_base: %s", dlerror());
             return -1;
         }
 
-        set_platform_aicore_rotation_table_func_ =
-            reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_aicore_rotation_table"));
+        set_platform_aicore_rotation_table_func_ = reinterpret_cast<void (*)(uint64_t)>(
+            dlsym(aicpu_so_handle_, "set_platform_l2_swimlane_aicore_rotation_table")
+        );
         if (set_platform_aicore_rotation_table_func_ == nullptr) {
-            LOG_ERROR("dlsym failed for set_platform_aicore_rotation_table: %s", dlerror());
+            LOG_ERROR("dlsym failed for set_platform_l2_swimlane_aicore_rotation_table: %s", dlerror());
             return -1;
         }
 
@@ -523,9 +524,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     // Initialize per-subsystem shared memory.
     if (enable_l2_swimlane_) {
-        rc = init_l2_perf(num_aicore, device_id_);
+        rc = init_l2_swimlane(num_aicore, device_id_);
         if (rc != 0) {
-            LOG_ERROR("init_l2_perf failed: %d", rc);
+            LOG_ERROR("init_l2_swimlane failed: %d", rc);
             return rc;
         }
     }
@@ -566,7 +567,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     // On any exit from run() — success or early error — release the diagnostics
     // collectors' shared memory. They are only re-initialized per run(), so a
     // Worker reused across runs (e.g. a pytest session-scoped worker pool) would
-    // otherwise re-enter init_l2_perf() with stale state still allocated.
+    // otherwise re-enter init_l2_swimlane() with stale state still allocated.
     auto perf_cleanup = RAIIScopeGuard([this]() {
         finalize_collectors();
     });
@@ -658,8 +659,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     set_platform_regs_func_(kernel_args_.regs);
     set_platform_dump_base_func_(kernel_args_.dump_data_base);
     set_dump_tensor_enabled_func_(enable_dump_tensor_);
-    set_platform_l2_perf_base_func_(kernel_args_.l2_perf_data_base);
-    set_platform_aicore_rotation_table_func_(kernel_args_.aicore_ring_addr);
+    set_platform_l2_swimlane_base_func_(kernel_args_.l2_swimlane_data_base);
+    set_platform_aicore_rotation_table_func_(kernel_args_.l2_swimlane_aicore_rotation_table);
     set_l2_swimlane_enabled_func_(enable_l2_swimlane_);
     set_platform_pmu_base_func_(kernel_args_.pmu_data_base);
     set_platform_pmu_reg_addrs_func_(kernel_args_.pmu_reg_addrs);
@@ -680,7 +681,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return create_thread(std::move(fn));
     };
     if (enable_l2_swimlane_) {
-        l2_perf_collector_.start(thread_factory);
+        l2_swimlane_collector_.start(thread_factory);
     }
     if (enable_dump_tensor_) {
         dump_collector_.start(thread_factory);
@@ -740,7 +741,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         aicore_threads.push_back(create_thread([this, &runtime, i, core_type, physical_core_id]() {
             aicore_execute_func_(
                 &runtime, i, core_type, physical_core_id, kernel_args_.regs, kernel_args_.enable_profiling_flag,
-                kernel_args_.aicore_ring_addr
+                kernel_args_.l2_swimlane_aicore_rotation_table
             );
         }));
     }
@@ -774,10 +775,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     // Diagnostic exports use the per-task `output_prefix_` directory the user
     // set on CallConfig (CallConfig::validate() enforces non-empty upstream).
     if (enable_l2_swimlane_) {
-        l2_perf_collector_.stop();
-        l2_perf_collector_.read_phase_header_metadata();
-        l2_perf_collector_.reconcile_counters();
-        l2_perf_collector_.export_swimlane_json();
+        l2_swimlane_collector_.stop();
+        l2_swimlane_collector_.read_phase_header_metadata();
+        l2_swimlane_collector_.reconcile_counters();
+        l2_swimlane_collector_.export_swimlane_json();
     }
 
     if (enable_dump_tensor_) {
@@ -851,7 +852,7 @@ void DeviceRunner::unload_executor_binaries() {
         set_platform_regs_func_ = nullptr;
         set_platform_dump_base_func_ = nullptr;
         set_dump_tensor_enabled_func_ = nullptr;
-        set_platform_l2_perf_base_func_ = nullptr;
+        set_platform_l2_swimlane_base_func_ = nullptr;
         set_platform_aicore_rotation_table_func_ = nullptr;
         set_l2_swimlane_enabled_func_ = nullptr;
         set_platform_pmu_base_func_ = nullptr;
@@ -1225,7 +1226,7 @@ uint64_t DeviceRunner::upload_chip_callable_buffer(const ChipCallable *callable)
 // Performance Profiling Implementation
 // =============================================================================
 
-int DeviceRunner::init_l2_perf(int num_aicore, int device_id) {
+int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
     auto alloc_cb = [this](size_t size) -> void * {
         return mem_alloc_.alloc(size);
     };
@@ -1234,16 +1235,17 @@ int DeviceRunner::init_l2_perf(int num_aicore, int device_id) {
     };
 
     // Simulation: dev pointer is directly host-accessible, no register pass-through.
-    int rc = l2_perf_collector_.initialize(
-        num_aicore, device_id, l2_perf_level_, alloc_cb, nullptr, free_cb, output_prefix_
+    int rc = l2_swimlane_collector_.initialize(
+        num_aicore, device_id, l2_swimlane_level_, alloc_cb, nullptr, free_cb, output_prefix_
     );
     if (rc != 0) {
         return rc;
     }
 
-    kernel_args_.l2_perf_data_base = reinterpret_cast<uint64_t>(l2_perf_collector_.get_l2_perf_setup_device_ptr());
-    kernel_args_.aicore_ring_addr =
-        reinterpret_cast<uint64_t>(l2_perf_collector_.get_aicore_ring_addr_table_device_ptr());
+    kernel_args_.l2_swimlane_data_base =
+        reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr());
+    kernel_args_.l2_swimlane_aicore_rotation_table =
+        reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_aicore_ring_addr_table_device_ptr());
     return 0;
 }
 
@@ -1331,8 +1333,8 @@ void DeviceRunner::finalize_collectors() {
         return mem_alloc_.free(dev_ptr);
     };
 
-    if (l2_perf_collector_.is_initialized()) {
-        l2_perf_collector_.finalize(nullptr, free_cb);
+    if (l2_swimlane_collector_.is_initialized()) {
+        l2_swimlane_collector_.finalize(nullptr, free_cb);
     }
     if (dump_collector_.is_initialized()) {
         dump_collector_.finalize(nullptr, free_cb);
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index f1a44e59b..a25f3cd30 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -49,12 +49,12 @@
 #include "common/core_type.h"
 #include "common/kernel_args.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "host/function_cache.h"
 #include "host/memory_allocator.h"
-#include "host/l2_perf_collector.h"
+#include "host/l2_swimlane_collector.h"
 #include "host/tensor_dump_collector.h"
 #include "host/pmu_collector.h"
 #include "host/dep_gen_collector.h"
@@ -187,8 +187,8 @@ class DeviceRunner {
      * Runtime struct / run() arg list so all three travel the same way.
      */
     void set_l2_swimlane_enabled(int level) {
-        l2_perf_level_ = static_cast<L2PerfLevel>(level);
-        enable_l2_swimlane_ = (l2_perf_level_ != L2PerfLevel::DISABLED);
+        l2_swimlane_level_ = static_cast<L2SwimlaneLevel>(level);
+        enable_l2_swimlane_ = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
     }
     void set_dump_tensor_enabled(bool enable) { enable_dump_tensor_ = enable; }
     void set_pmu_enabled(int enable_pmu) {
@@ -197,7 +197,7 @@ class DeviceRunner {
     }
     void set_dep_gen_enabled(bool enable) { enable_dep_gen_ = enable; }
     void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; }
-    // Directory under which all diagnostic artifacts (l2_perf_records.json /
+    // Directory under which all diagnostic artifacts (l2_swimlane_records.json /
     // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic
     // is enabled; CallConfig::validate() enforces this contract upstream.
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
@@ -381,7 +381,7 @@ class DeviceRunner {
     void (*set_platform_regs_func_)(uint64_t){nullptr};
     void (*set_platform_dump_base_func_)(uint64_t){nullptr};
     void (*set_dump_tensor_enabled_func_)(bool){nullptr};
-    void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr};
+    void (*set_platform_l2_swimlane_base_func_)(uint64_t){nullptr};
     void (*set_platform_aicore_rotation_table_func_)(uint64_t){nullptr};
     void (*set_l2_swimlane_enabled_func_)(bool){nullptr};
     void (*set_platform_pmu_base_func_)(uint64_t){nullptr};
@@ -395,7 +395,7 @@ class DeviceRunner {
     std::string aicore_so_path_;
 
     // Performance profiling
-    L2PerfCollector l2_perf_collector_;
+    L2SwimlaneCollector l2_swimlane_collector_;
 
     // Tensor dump (independent shared memory + memory manager)
     TensorDumpCollector dump_collector_;
@@ -429,7 +429,7 @@ class DeviceRunner {
      * @param device_id Device ID (ignored in simulation)
      * @return 0 on success, error code on failure
      */
-    int init_l2_perf(int num_aicore, int device_id);
+    int init_l2_swimlane(int num_aicore, int device_id);
 
     int init_tensor_dump(Runtime &runtime, int device_id);
 
@@ -456,9 +456,9 @@ class DeviceRunner {
     bool enable_pmu_{false};
     bool enable_dep_gen_{false};
     bool enable_scope_stats_{false};
-    L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};             // resolved from set_l2_swimlane_enabled()
-    PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};  // resolved from set_pmu_enabled()
-    std::string output_prefix_{};                                  // diagnostic artifact root directory
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};  // resolved from set_l2_swimlane_enabled()
+    PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};   // resolved from set_pmu_enabled()
+    std::string output_prefix_{};                                   // diagnostic artifact root directory
 };
 
 #endif  // SRC_A2A3_PLATFORM_SIM_HOST_DEVICE_RUNNER_H_
diff --git a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
similarity index 68%
rename from src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp
rename to src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
index 7ab5b7498..af44aced2 100644
--- a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp
+++ b/src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -10,15 +10,15 @@
  */
 
 /**
- * @file l2_perf_collector_aicpu.cpp
+ * @file l2_swimlane_collector_aicpu.cpp
  * @brief AICPU performance data collection implementation (SPSC free queue)
  *
- * Uses per-core L2PerfBufferState with SPSC free queues for O(1) buffer switching.
+ * Uses per-core L2SwimlaneAicpuTaskPool with SPSC free queues for O(1) buffer switching.
  * Host memory manager dynamically allocates replacement buffers and pushes
  * them into the free_queue. Device pops from free_queue when switching.
  */
 
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 
 #include <cinttypes>
 #include <cstring>
@@ -29,79 +29,83 @@
 #include "common/unified_log.h"
 
 // Cached pointers for hot-path access (set during init)
-static AicpuPhaseHeader *s_phase_header = nullptr;
-static L2PerfDataHeader *s_l2_perf_header = nullptr;
+static L2SwimlaneAicpuPhaseHeader *s_l2_swimlane_aicpu_phase_header = nullptr;
+static L2SwimlaneDataHeader *s_l2_swimlane_header = nullptr;
 
-// Per-core L2PerfBufferState cache
-static L2PerfBufferState *s_perf_buffer_states[PLATFORM_MAX_CORES] = {};
+// Per-core L2SwimlaneAicpuTaskPool cache
+static L2SwimlaneAicpuTaskPool *s_aicpu_task_pools[PLATFORM_MAX_CORES] = {};
 
-// Per-core L2PerfAicoreBufferState cache (lives in the same shared region;
+// Per-core L2SwimlaneAicoreTaskPool cache (lives in the same shared region;
 // host writes initial pool + the rotation channel that AICore polls).
 //
 // All AICore-side bookkeeping (rotation channel, free queue,
 // total_record_count, current_buf_seq) is owned by this shared struct — see
-// l2_perf_profiling.h. We deliberately do not keep AICPU-process-local
+// l2_swimlane_profiling.h. We deliberately do not keep AICPU-process-local
 // mirror counters because the struct's volatile fields are the single
 // source of truth across init/complete/rotate/flush. The high-water-mark
 // formula `total_record_count - current_buf_seq * BUFFER_SIZE` correctly
 // handles the failed-rotation case (free_queue empty or ready_queue full)
 // since current_buf_seq only bumps on a successful rotation.
-static L2PerfAicoreBufferState *s_aicore_buffer_states[PLATFORM_MAX_CORES] = {};
+static L2SwimlaneAicoreTaskPool *s_aicore_task_pools[PLATFORM_MAX_CORES] = {};
 
 // Per-core cached current-records-buffer pointer. Written by AICPU when
 // rotating buffers from inside `complete_record`. AICore writes to its own
-// per-core L2PerfAicoreBuffer (host-allocated, AICPU rotates) and AICPU
+// per-core L2SwimlaneAicoreTaskBuffer (host-allocated, AICPU rotates) and AICPU
 // never reads from it on the hot path.
-static L2PerfBuffer *s_perf_records_buffers[PLATFORM_MAX_CORES] = {};
+static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORES] = {};
 
-// Per-thread PhaseBufferState cache
-static PhaseBufferState *s_phase_buffer_states[PLATFORM_MAX_AICPU_THREADS] = {};
-static PhaseBuffer *s_current_phase_buf[PLATFORM_MAX_AICPU_THREADS] = {};
+// Per-thread L2SwimlaneAicpuPhasePool cache
+static L2SwimlaneAicpuPhasePool *s_aicpu_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
+static L2SwimlaneAicpuPhaseBuffer *s_current_aicpu_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 
 static int s_orch_thread_idx = -1;
 
-// L2 perf platform state. Published by the host (via dlsym'd setters on sim)
+// L2 swimlane platform state. Published by the host (via dlsym'd setters on sim)
 // or by the AICPU kernel entry (onboard) before perf init runs, so downstream
 // perf code can discover enablement + device-base without reading the generic
 // Runtime struct. Two channels (mirrors PMU):
 //   - g_enable_l2_swimlane (bool) — set at kernel entry from the bitmask bit
-//   - g_l2_perf_level (L2PerfLevel) — promoted in
-//     l2_perf_aicpu_init from the shared-memory header so
+//   - g_l2_swimlane_level (L2SwimlaneLevel) — promoted in
+//     l2_swimlane_aicpu_init from the shared-memory header so
 //     `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates have the granular
-//     value (exposed via get_l2_perf_level()).
-static uint64_t g_platform_l2_perf_base = 0;
+//     value (exposed via get_l2_swimlane_level()).
+static uint64_t g_platform_l2_swimlane_base = 0;
 static bool g_enable_l2_swimlane = false;
-static L2PerfLevel g_l2_perf_level = L2PerfLevel::DISABLED;
+static L2SwimlaneLevel g_l2_swimlane_level = L2SwimlaneLevel::DISABLED;
 
-// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr).
+// AICore rotation-table device pointer (= KernelArgs::l2_swimlane_aicore_rotation_table).
 // Published by the host (sim: dlsym'd setter; onboard: from k_args via the
 // kernel entry); AICPU init walks it to fill per-core &rotation addresses.
-static uint64_t g_platform_aicore_rotation_table = 0;
+static uint64_t g_platform_l2_swimlane_aicore_rotation_table = 0;
 
-extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base) { g_platform_l2_perf_base = l2_perf_data_base; }
-extern "C" uint64_t get_platform_l2_perf_base() { return g_platform_l2_perf_base; }
+extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base) {
+    g_platform_l2_swimlane_base = l2_swimlane_data_base;
+}
+extern "C" uint64_t get_platform_l2_swimlane_base() { return g_platform_l2_swimlane_base; }
 extern "C" void set_l2_swimlane_enabled(bool enable) { g_enable_l2_swimlane = enable; }
 extern "C" bool is_l2_swimlane_enabled() { return g_enable_l2_swimlane; }
-extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr) {
-    g_platform_aicore_rotation_table = table_addr;
+extern "C" void set_platform_l2_swimlane_aicore_rotation_table(uint64_t table_addr) {
+    g_platform_l2_swimlane_aicore_rotation_table = table_addr;
+}
+extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table() {
+    return g_platform_l2_swimlane_aicore_rotation_table;
 }
-extern "C" uint64_t get_platform_aicore_rotation_table() { return g_platform_aicore_rotation_table; }
-L2PerfLevel get_l2_perf_level() { return g_l2_perf_level; }
+L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; }
 
 /**
  * Enqueue ready buffer to per-thread queue
  *
- * @param header L2PerfDataHeader pointer
+ * @param header L2SwimlaneDataHeader pointer
  * @param thread_idx Thread index
  * @param core_index Core index (or thread_idx for phase entries)
  * @param buffer_ptr Device pointer to the full buffer
  * @param buffer_seq Sequence number for ordering
- * @param is_phase 0 = L2PerfRecord, 1 = Phase
+ * @param kind Buffer kind discriminator (see L2SwimlaneBufferKind)
  * @return 0 on success, -1 if queue full
  */
 static int enqueue_ready_buffer(
-    L2PerfDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq,
-    uint32_t is_phase
+    L2SwimlaneDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq,
+    L2SwimlaneBufferKind kind
 ) {
     uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE;
     uint32_t current_tail = header->queue_tails[thread_idx];
@@ -114,7 +118,7 @@ static int enqueue_ready_buffer(
     }
 
     header->queues[thread_idx][current_tail].core_index = core_index;
-    header->queues[thread_idx][current_tail].is_phase = is_phase;
+    header->queues[thread_idx][current_tail].kind = kind;
     header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr;
     header->queues[thread_idx][current_tail].buffer_seq = buffer_seq;
     header->queue_tails[thread_idx] = next_tail;
@@ -122,42 +126,42 @@ static int enqueue_ready_buffer(
     return 0;
 }
 
-void l2_perf_aicpu_init(int worker_count) {
-    void *l2_perf_base = reinterpret_cast<void *>(g_platform_l2_perf_base);
-    if (l2_perf_base == nullptr) {
-        LOG_ERROR("l2_perf_data_base is NULL, cannot initialize profiling");
+void l2_swimlane_aicpu_init(int worker_count) {
+    void *l2_swimlane_base = reinterpret_cast<void *>(g_platform_l2_swimlane_base);
+    if (l2_swimlane_base == nullptr) {
+        LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize profiling");
         return;
     }
 
-    s_l2_perf_header = get_l2_perf_header(l2_perf_base);
+    s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base);
 
-    // Read the granular perf_level from the shared-memory header (host wrote
+    // Read the granular l2_swimlane_level from the shared-memory header (host wrote
-    // it in L2PerfCollector::initialize). The kernel-entry setter only seeded
+    // it in L2SwimlaneCollector::initialize). The kernel-entry setter only seeded
     // the binary g_enable_l2_swimlane via the bitmask bit.
-    g_l2_perf_level = static_cast<L2PerfLevel>(s_l2_perf_header->l2_perf_level);
+    g_l2_swimlane_level = static_cast<L2SwimlaneLevel>(s_l2_swimlane_header->l2_swimlane_level);
 
     LOG_INFO_V0(
-        "Initializing performance profiling for %d cores (free queue), l2_perf_level=%u", worker_count,
-        static_cast<uint32_t>(g_l2_perf_level)
+        "Initializing performance profiling for %d cores (free queue), l2_swimlane_level=%u", worker_count,
+        static_cast<uint32_t>(g_l2_swimlane_level)
     );
 
-    // Populate the per-core AicoreRotation device-address table. AICore reads
-    // `aicore_ring_addr[block_idx]` from KernelArgs to find its rotation
+    // Populate the per-core L2SwimlaneAicoreRotation device-address table. AICore reads
+    // `l2_swimlane_aicore_rotation_table[block_idx]` from KernelArgs to find its rotation
     // channel; the table itself is host-allocated, but the entries are
     // device-internal addresses (`&ac_state->rotation`) that the host would
     // otherwise have to translate from host-mapped to device-mapped. AICPU
     // already runs on the device, so it can write the addresses directly
     // without any translation — that keeps the host side decoupled from the
     // AICore shared-memory layout.
-    uint64_t *rotation_table = reinterpret_cast<uint64_t *>(g_platform_aicore_rotation_table);
+    uint64_t *rotation_table = reinterpret_cast<uint64_t *>(g_platform_l2_swimlane_aicore_rotation_table);
 
     // Pop first buffer from free_queue for each core
     for (int i = 0; i < worker_count; i++) {
-        L2PerfBufferState *state = get_perf_buffer_state(l2_perf_base, i);
-        L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(l2_perf_base, worker_count, i);
+        L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(l2_swimlane_base, i);
+        L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(l2_swimlane_base, worker_count, i);
 
-        s_perf_buffer_states[i] = state;
-        s_aicore_buffer_states[i] = ac_state;
+        s_aicpu_task_pools[i] = state;
+        s_aicore_task_pools[i] = ac_state;
 
         if (rotation_table != nullptr) {
             rotation_table[i] = reinterpret_cast<uint64_t>(&ac_state->rotation);
@@ -176,15 +180,15 @@ void l2_perf_aicpu_init(int worker_count) {
             state->current_buf_seq = 0;
             wmb();
 
-            L2PerfBuffer *buf = reinterpret_cast<L2PerfBuffer *>(buf_ptr);
+            L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(buf_ptr);
             buf->count = 0;
-            s_perf_records_buffers[i] = buf;
+            s_current_aicpu_task_buffers[i] = buf;
 
             LOG_DEBUG("Core %d: popped initial buffer (addr=0x%lx)", i, buf_ptr);
         } else {
             LOG_ERROR("Core %d: free_queue is empty during init!", i);
             state->current_buf_ptr = 0;
-            s_perf_records_buffers[i] = nullptr;
+            s_current_aicpu_task_buffers[i] = nullptr;
         }
 
         // Prime the AICore rotation channel with the initial buffer.
@@ -201,7 +205,7 @@ void l2_perf_aicpu_init(int worker_count) {
             ac_state->rotation.current_buf_ptr = ac_buf_ptr;
             ac_state->rotation.generation = 1;
             wmb();
-            L2PerfAicoreBuffer *ac_buf = reinterpret_cast<L2PerfAicoreBuffer *>(ac_buf_ptr);
+            L2SwimlaneAicoreTaskBuffer *ac_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(ac_buf_ptr);
             ac_buf->count = 0;
             LOG_DEBUG("Core %d: primed AICore rotation with buf=0x%lx, gen=1", i, ac_buf_ptr);
         } else {
@@ -218,18 +222,18 @@ void l2_perf_aicpu_init(int worker_count) {
 }
 
 /**
- * Internal records-buffer rotation. Called from `l2_perf_aicpu_complete_record`
+ * Internal records-buffer rotation. Called from `l2_swimlane_aicpu_complete_task`
  * after a record is committed and the buffer hits capacity. Only swaps an
  * AICPU-private records pointer — AICore reads from a stable ring and is
  * unaffected by this call.
  */
 static void switch_records_buffer(int core_id, int thread_idx) {
-    L2PerfBufferState *state = s_perf_buffer_states[core_id];
+    L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id];
     if (state == nullptr) {
         return;
     }
 
-    L2PerfBuffer *full_buf = s_perf_records_buffers[core_id];
+    L2SwimlaneAicpuTaskBuffer *full_buf = s_current_aicpu_task_buffers[core_id];
     if (full_buf == nullptr) {
         return;
     }
@@ -252,7 +256,9 @@ static void switch_records_buffer(int core_id, int thread_idx) {
 
     // Enqueue full buffer to ReadyQueue
     uint32_t seq = state->current_buf_seq;
-    int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, state->current_buf_ptr, seq, 0);
+    int rc = enqueue_ready_buffer(
+        s_l2_swimlane_header, thread_idx, core_id, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
+    );
     if (rc != 0) {
         LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id);
         // Revert: discard data and keep writing
@@ -270,21 +276,21 @@ static void switch_records_buffer(int core_id, int thread_idx) {
     state->current_buf_seq = seq + 1;
     wmb();
 
-    L2PerfBuffer *new_buf = reinterpret_cast<L2PerfBuffer *>(new_buf_ptr);
+    L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(new_buf_ptr);
     new_buf->count = 0;
-    s_perf_records_buffers[core_id] = new_buf;
+    s_current_aicpu_task_buffers[core_id] = new_buf;
 
     LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr);
 }
 
 // Try to rotate the AICore buffer for `core_id`. Called from the completion
-// path after a successful L2PerfRecord commit so the just-FIN'd task's
+// path after a successful L2SwimlaneAicpuTaskRecord commit so the just-FIN'd task's
 // AICore record is guaranteed to be in the old buffer before we enqueue it.
 // On success bumps `ac_state->current_buf_seq`; on failure (empty free queue
 // or full ready queue) the old buffer is abandoned in place, AICore overflows
 // it from now on, and the drop count grows.
 static void aicore_rotate(int core_id, int thread_idx) {
-    L2PerfAicoreBufferState *ac_state = s_aicore_buffer_states[core_id];
+    L2SwimlaneAicoreTaskPool *ac_state = s_aicore_task_pools[core_id];
     if (ac_state == nullptr) {
         return;
     }
@@ -312,10 +318,12 @@ static void aicore_rotate(int core_id, int thread_idx) {
 
     // Enqueue the just-filled AICore buffer with count = BUFFER_SIZE.
     if (old_buf_ptr != 0) {
-        L2PerfAicoreBuffer *old_buf = reinterpret_cast<L2PerfAicoreBuffer *>(old_buf_ptr);
+        L2SwimlaneAicoreTaskBuffer *old_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(old_buf_ptr);
         old_buf->count = static_cast<uint32_t>(PLATFORM_AICORE_BUFFER_SIZE);
         wmb();
-        int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, old_buf_ptr, seq, /*is_phase=*/2);
+        int rc = enqueue_ready_buffer(
+            s_l2_swimlane_header, thread_idx, core_id, old_buf_ptr, seq, L2SwimlaneBufferKind::AicoreTask
+        );
         if (rc != 0) {
             LOG_ERROR(
                 "Thread %d: Core %d failed to enqueue AICore buffer (queue full), %d records lost", thread_idx, core_id,
@@ -332,7 +340,7 @@ static void aicore_rotate(int core_id, int thread_idx) {
     rmb();
     ac_state->free_queue.head = head + 1;
     ac_state->current_buf_seq = seq + 1;
-    L2PerfAicoreBuffer *new_buf = reinterpret_cast<L2PerfAicoreBuffer *>(new_buf_ptr);
+    L2SwimlaneAicoreTaskBuffer *new_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(new_buf_ptr);
     new_buf->count = 0;
 
     wmb();  // ensure new_buf->count=0 visible before AICore sees new ptr
@@ -342,18 +350,18 @@ static void aicore_rotate(int core_id, int thread_idx) {
 }
 
 // Public no-op shim kept so callers compile during the cross-runtime
-// transition; the rotation has been moved into l2_perf_aicpu_complete_record
+// transition; the rotation has been moved into l2_swimlane_aicpu_complete_task
 // where it is race-free vs in-flight AICore record writes.
-void l2_perf_aicpu_maybe_rotate_aicore(int /*core_id*/, int /*thread_idx*/) {}
+void l2_swimlane_aicpu_maybe_rotate_aicore(int /*core_id*/, int /*thread_idx*/) {}
 
-int l2_perf_aicpu_complete_record(
+int l2_swimlane_aicpu_complete_task(
     int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type,
     uint64_t dispatch_time, uint64_t finish_time
 ) {
     if (core_id < 0 || core_id >= PLATFORM_MAX_CORES) {
         return -1;
     }
-    L2PerfBufferState *state = s_perf_buffer_states[core_id];
+    L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id];
     if (state == nullptr) {
         return -1;
     }
@@ -362,14 +370,14 @@ int l2_perf_aicpu_complete_record(
     // `device_total - (collected + dropped)`.
     state->total_record_count += 1;
 
-    L2PerfBuffer *l2_perf_buf = s_perf_records_buffers[core_id];
-    if (l2_perf_buf == nullptr) {
+    L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id];
+    if (l2_swimlane_buf == nullptr) {
         // No active records buffer (init ran out of free buffers); count as drop
         // so host reconciliation stays consistent.
         state->dropped_record_count += 1;
         return -1;
     }
-    uint32_t count = l2_perf_buf->count;
+    uint32_t count = l2_swimlane_buf->count;
     if (count >= PLATFORM_PROF_BUFFER_SIZE) {
         // Defensive: should not happen because we rotate at end of every commit.
         state->dropped_record_count += 1;
@@ -377,14 +385,14 @@ int l2_perf_aicpu_complete_record(
     }
 
     // AICore-as-producer: AICore writes start/end/task_id directly into its
-    // own per-core L2PerfAicoreBuffer (indexed by reg_task_id % SIZE). AICPU
+    // own per-core L2SwimlaneAicoreTaskBuffer (indexed by reg_task_id % SIZE). AICPU
     // writes only AICPU-owned fields here; start/end stay zero on-device and
     // are patched by the host when the buffer is consumed. Join key is
     // `reg_task_id` (monotonic per core), stored alongside the PTO2-encoded
     // `task_id` so the host can match without a hashmap lookup. This
     // eliminates the per-task rmb() + staging cache-line read the previous
     // design required.
-    L2PerfRecord *record = &l2_perf_buf->records[count];
+    L2SwimlaneAicpuTaskRecord *record = &l2_swimlane_buf->records[count];
     record->start_time = 0;
     record->end_time = 0;
     record->duration = 0;
@@ -394,7 +402,7 @@ int l2_perf_aicpu_complete_record(
     record->core_type = core_type;
 
     // AICPU_TIMING and above: dispatch/finish timing.
-    if (g_l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+    if (g_l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
         record->dispatch_time = dispatch_time;
         record->finish_time = finish_time;
     } else {
@@ -403,10 +411,10 @@ int l2_perf_aicpu_complete_record(
     }
 
     uint32_t new_count = count + 1;
-    l2_perf_buf->count = new_count;
+    l2_swimlane_buf->count = new_count;
     wmb();
 
-    // Rotate AICpu's L2PerfBuffer after the write so the just-committed
+    // Rotate AICPU's L2SwimlaneAicpuTaskBuffer after the write so the just-committed
     // record is preserved.
     if (new_count >= PLATFORM_PROF_BUFFER_SIZE) {
         switch_records_buffer(core_id, thread_idx);
@@ -435,7 +443,7 @@ int l2_perf_aicpu_complete_record(
     // total_record_count is uint32_t — wraps after ~4 G completions per core.
     // At realistic dispatch rates this is multi-week continuous-run territory;
     // we accept the limitation rather than widening the on-device counter.
-    L2PerfAicoreBufferState *ac_state = s_aicore_buffer_states[core_id];
+    L2SwimlaneAicoreTaskPool *ac_state = s_aicore_task_pools[core_id];
     if (ac_state != nullptr) {
         uint32_t completed = ac_state->total_record_count + 1;
         ac_state->total_record_count = completed;
@@ -447,13 +455,13 @@ int l2_perf_aicpu_complete_record(
     return 0;
 }
 
-void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num) {
+void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num) {
     if (!g_enable_l2_swimlane) {
         return;
     }
 
-    void *l2_perf_base = reinterpret_cast<void *>(g_platform_l2_perf_base);
-    if (l2_perf_base == nullptr) {
+    void *l2_swimlane_base = reinterpret_cast<void *>(g_platform_l2_swimlane_base);
+    if (l2_swimlane_base == nullptr) {
         return;
     }
 
@@ -465,7 +473,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
 
     for (int i = 0; i < core_num; i++) {
         int core_id = cur_thread_cores[i];
-        L2PerfBufferState *state = s_perf_buffer_states[core_id];
+        L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id];
         if (state == nullptr) continue;
 
         rmb();
@@ -473,15 +481,17 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
         if (buf_ptr == 0) {
             // No active buffer
         } else {
-            L2PerfBuffer *buf = reinterpret_cast<L2PerfBuffer *>(buf_ptr);
+            L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(buf_ptr);
             if (buf->count > 0) {
                 uint32_t seq = state->current_buf_seq;
-                int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, buf_ptr, seq, 0);
+                int rc = enqueue_ready_buffer(
+                    s_l2_swimlane_header, thread_idx, core_id, buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
+                );
                 if (rc == 0) {
                     LOG_INFO_V0("Thread %d: Core %d flushed buffer with %u records", thread_idx, core_id, buf->count);
                     flushed_count++;
                     state->current_buf_ptr = 0;
-                    s_perf_records_buffers[core_id] = nullptr;
+                    s_current_aicpu_task_buffers[core_id] = nullptr;
                     wmb();
                 } else {
                     // ready_queue full at end-of-run: account the loss and clear the
@@ -494,7 +504,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
                     state->dropped_record_count = state->dropped_record_count + buf->count;
                     buf->count = 0;
                     state->current_buf_ptr = 0;
-                    s_perf_records_buffers[core_id] = nullptr;
+                    s_current_aicpu_task_buffers[core_id] = nullptr;
                     wmb();
                 }
             }
@@ -510,7 +520,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
         // unknown number of dropped overflow attempts; the formula clamps
         // to BUFFER_SIZE in that case rather than stamping a stale partial
         // count.
-        L2PerfAicoreBufferState *ac_state = s_aicore_buffer_states[core_id];
+        L2SwimlaneAicoreTaskPool *ac_state = s_aicore_task_pools[core_id];
         if (ac_state == nullptr) continue;
 
         rmb();
@@ -527,12 +537,14 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
         uint32_t ac_mark = (live > static_cast<uint32_t>(PLATFORM_AICORE_BUFFER_SIZE)) ?
                                static_cast<uint32_t>(PLATFORM_AICORE_BUFFER_SIZE) :
                                live;
-        L2PerfAicoreBuffer *ac_buf = reinterpret_cast<L2PerfAicoreBuffer *>(ac_buf_ptr);
+        L2SwimlaneAicoreTaskBuffer *ac_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(ac_buf_ptr);
         ac_buf->count = ac_mark;
         wmb();
 
         uint32_t ac_seq = ac_state->current_buf_seq;
-        int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, ac_buf_ptr, ac_seq, /*is_phase=*/2);
+        int rc = enqueue_ready_buffer(
+            s_l2_swimlane_header, thread_idx, core_id, ac_buf_ptr, ac_seq, L2SwimlaneBufferKind::AicoreTask
+        );
         if (rc == 0) {
             LOG_INFO_V0(
                 "Thread %d: Core %d flushed AICore buffer (seq=%u, count=%u)", thread_idx, core_id, ac_seq, ac_mark
@@ -552,22 +564,24 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
     LOG_INFO_V0("Thread %d: Performance buffer flush complete, %d buffers flushed", thread_idx, flushed_count);
 }
 
-void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
-    void *l2_perf_base = reinterpret_cast<void *>(g_platform_l2_perf_base);
-    if (l2_perf_base == nullptr) {
-        LOG_ERROR("l2_perf_data_base is NULL, cannot initialize phase profiling");
+void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads) {
+    void *l2_swimlane_base = reinterpret_cast<void *>(g_platform_l2_swimlane_base);
+    if (l2_swimlane_base == nullptr) {
+        LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize phase profiling");
         return;
     }
 
-    s_phase_header = get_phase_header(l2_perf_base, worker_count);
-    s_l2_perf_header = get_l2_perf_header(l2_perf_base);
+    s_l2_swimlane_aicpu_phase_header = get_phase_header(l2_swimlane_base, worker_count);
+    s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base);
 
-    s_phase_header->magic = AICPU_PHASE_MAGIC;
-    s_phase_header->num_sched_threads = num_sched_threads;
-    s_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD;
-    s_phase_header->num_cores = 0;
+    s_l2_swimlane_aicpu_phase_header->magic = L2_SWIMLANE_AICPU_PHASE_MAGIC;
+    s_l2_swimlane_aicpu_phase_header->num_sched_threads = num_sched_threads;
+    s_l2_swimlane_aicpu_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD;
+    s_l2_swimlane_aicpu_phase_header->num_cores = 0;
 
-    memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread));
+    memset(
+        s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
+    );
 
     // Cache per-thread record pointers and clear buffers
     // Include all threads: scheduler + orchestrator (orchestrators may become schedulers)
@@ -576,9 +590,9 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
         total_threads = PLATFORM_MAX_AICPU_THREADS;
     }
     for (int t = 0; t < total_threads; t++) {
-        PhaseBufferState *state = get_phase_buffer_state(l2_perf_base, worker_count, t);
+        L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(l2_swimlane_base, worker_count, t);
 
-        s_phase_buffer_states[t] = state;
+        s_aicpu_phase_pools[t] = state;
 
         // Pop first buffer from free_queue
         rmb();
@@ -593,22 +607,22 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
             state->current_buf_seq = 0;
             wmb();
 
-            PhaseBuffer *buf = reinterpret_cast<PhaseBuffer *>(buf_ptr);
+            L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(buf_ptr);
             buf->count = 0;
-            s_current_phase_buf[t] = buf;
+            s_current_aicpu_phase_buffers[t] = buf;
 
             LOG_DEBUG("Thread %d: popped initial phase buffer (addr=0x%lx)", t, buf_ptr);
         } else {
             LOG_ERROR("Thread %d: phase free_queue is empty during init!", t);
             state->current_buf_ptr = 0;
-            s_current_phase_buf[t] = nullptr;
+            s_current_aicpu_phase_buffers[t] = nullptr;
         }
     }
 
     // Clear remaining slots
     for (int t = total_threads; t < PLATFORM_MAX_AICPU_THREADS; t++) {
-        s_phase_buffer_states[t] = nullptr;
-        s_current_phase_buf[t] = nullptr;
+        s_aicpu_phase_pools[t] = nullptr;
+        s_current_aicpu_phase_buffers[t] = nullptr;
     }
 
     wmb();
@@ -623,28 +637,30 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
  * Switch phase buffer when current buffer is full (free queue version)
  *
  * Enqueues the full buffer to ReadyQueue and pops the next buffer from free_queue.
- * If no free buffer is available, sets s_current_phase_buf to nullptr so subsequent
+ * If no free buffer is available, sets s_current_aicpu_phase_buffers[thread_idx] to nullptr so subsequent
  * records are dropped (preserving already-enqueued data).
  */
 static void switch_phase_buffer(int thread_idx) {
-    PhaseBufferState *state = s_phase_buffer_states[thread_idx];
+    L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx];
     if (state == nullptr) return;
 
-    PhaseBuffer *full_buf = s_current_phase_buf[thread_idx];
+    L2SwimlaneAicpuPhaseBuffer *full_buf = s_current_aicpu_phase_buffers[thread_idx];
     if (full_buf == nullptr) return;
 
     LOG_INFO_V0("Thread %d: phase buffer is full (count=%u)", thread_idx, full_buf->count);
 
     // Enqueue to ReadyQueue
     uint32_t seq = state->current_buf_seq;
-    int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, state->current_buf_ptr, seq, 1);
+    int rc = enqueue_ready_buffer(
+        s_l2_swimlane_header, thread_idx, thread_idx, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase
+    );
     if (rc != 0) {
         LOG_ERROR(
             "Thread %d: failed to enqueue phase buffer (queue full), %u records lost!", thread_idx, full_buf->count
         );
         state->dropped_record_count = state->dropped_record_count + full_buf->count;
         full_buf->count = 0;
-        s_current_phase_buf[thread_idx] = nullptr;
+        s_current_aicpu_phase_buffers[thread_idx] = nullptr;
         state->current_buf_ptr = 0;
         wmb();
         return;
@@ -663,29 +679,29 @@ static void switch_phase_buffer(int thread_idx) {
         state->current_buf_seq = seq + 1;
         wmb();
 
-        PhaseBuffer *new_buf = reinterpret_cast<PhaseBuffer *>(new_buf_ptr);
+        L2SwimlaneAicpuPhaseBuffer *new_buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(new_buf_ptr);
         new_buf->count = 0;
-        s_current_phase_buf[thread_idx] = new_buf;
+        s_current_aicpu_phase_buffers[thread_idx] = new_buf;
 
         LOG_INFO_V0("Thread %d: switched to new phase buffer", thread_idx);
     } else {
         // No free buffer available, drop subsequent records
         LOG_WARN("Thread %d: no free phase buffer available, dropping records until Host catches up", thread_idx);
-        s_current_phase_buf[thread_idx] = nullptr;
+        s_current_aicpu_phase_buffers[thread_idx] = nullptr;
         state->current_buf_ptr = 0;
         wmb();
     }
 }
 
-void l2_perf_aicpu_record_phase(
-    int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
+void l2_swimlane_aicpu_record_phase(
+    int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
     uint64_t tasks_processed, uint32_t extra1, uint32_t extra2
 ) {
-    if (s_phase_header == nullptr) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
         return;
     }
 
-    PhaseBufferState *state = s_phase_buffer_states[thread_idx];
+    L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx];
     if (state == nullptr) {
         return;
     }
@@ -694,7 +710,7 @@ void l2_perf_aicpu_record_phase(
     // as `device_total - (collected + dropped)` (mirrors PERF accounting).
     state->total_record_count += 1;
 
-    PhaseBuffer *buf = s_current_phase_buf[thread_idx];
+    L2SwimlaneAicpuPhaseBuffer *buf = s_current_aicpu_phase_buffers[thread_idx];
 
     // Try to recover from nullptr (no buffer was available on previous switch)
     if (buf == nullptr) {
@@ -710,9 +726,9 @@ void l2_perf_aicpu_record_phase(
             state->current_buf_seq = state->current_buf_seq + 1;
             wmb();
 
-            buf = reinterpret_cast<PhaseBuffer *>(buf_ptr);
+            buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(buf_ptr);
             buf->count = 0;
-            s_current_phase_buf[thread_idx] = buf;
+            s_current_aicpu_phase_buffers[thread_idx] = buf;
 
             LOG_INFO_V0("Thread %d: recovered phase buffer", thread_idx);
         }
@@ -727,7 +743,7 @@ void l2_perf_aicpu_record_phase(
     if (idx >= PLATFORM_PHASE_RECORDS_PER_THREAD) {
         // Buffer full, switch to next buffer
         switch_phase_buffer(thread_idx);
-        buf = s_current_phase_buf[thread_idx];
+        buf = s_current_aicpu_phase_buffers[thread_idx];
         if (buf == nullptr) {
             state->dropped_record_count += 1;
             return;
@@ -739,7 +755,7 @@ void l2_perf_aicpu_record_phase(
         }
     }
 
-    AicpuPhaseRecord *record = &buf->records[idx];
+    L2SwimlaneAicpuPhaseRecord *record = &buf->records[idx];
     record->start_time = start_time;
     record->end_time = end_time;
     record->loop_iter = loop_iter;
@@ -751,21 +767,21 @@ void l2_perf_aicpu_record_phase(
     buf->count = idx + 1;
 }
 
-void l2_perf_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thread_idx; }
+void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thread_idx; }
 
-void l2_perf_aicpu_record_orch_phase(
-    AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
+void l2_swimlane_aicpu_record_orch_phase(
+    L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
 ) {
-    if (s_orch_thread_idx < 0 || s_phase_header == nullptr) return;
-    l2_perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
+    if (s_orch_thread_idx < 0 || s_l2_swimlane_aicpu_phase_header == nullptr) return;
+    l2_swimlane_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
 }
 
-void l2_perf_aicpu_flush_phase_buffers(int thread_idx) {
-    if (s_phase_header == nullptr || s_l2_perf_header == nullptr) {
+void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr || s_l2_swimlane_header == nullptr) {
         return;
     }
 
-    PhaseBufferState *state = s_phase_buffer_states[thread_idx];
+    L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx];
     if (state == nullptr) return;
 
     rmb();
@@ -775,13 +791,15 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) {
         return;
     }
 
-    PhaseBuffer *buf = reinterpret_cast<PhaseBuffer *>(buf_ptr);
+    L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(buf_ptr);
     if (buf->count == 0) {
         return;
     }
 
     uint32_t seq = state->current_buf_seq;
-    int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, buf_ptr, seq, 1);
+    int rc = enqueue_ready_buffer(
+        s_l2_swimlane_header, thread_idx, thread_idx, buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase
+    );
     if (rc == 0) {
         LOG_INFO_V0("Thread %d: flushed phase buffer with %u records", thread_idx, buf->count);
     } else {
@@ -790,28 +808,30 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) {
         buf->count = 0;
     }
     state->current_buf_ptr = 0;
-    s_current_phase_buf[thread_idx] = nullptr;
+    s_current_aicpu_phase_buffers[thread_idx] = nullptr;
     wmb();
 }
 
-void l2_perf_aicpu_init_core_assignments(int total_cores) {
-    if (s_phase_header == nullptr) {
+void l2_swimlane_aicpu_init_core_assignments(int total_cores) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
         return;
     }
-    memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread));
-    s_phase_header->num_cores = static_cast<uint32_t>(total_cores);
+    memset(
+        s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
+    );
+    s_l2_swimlane_aicpu_phase_header->num_cores = static_cast<uint32_t>(total_cores);
     wmb();
     LOG_INFO_V0("Core-to-thread mapping init: %d cores", total_cores);
 }
 
-void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) {
-    if (s_phase_header == nullptr) {
+void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
         return;
     }
     for (int i = 0; i < core_num; i++) {
         int core_id = core_ids[i];
         if (core_id >= 0 && core_id < PLATFORM_MAX_CORES) {
-            s_phase_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
+            s_l2_swimlane_aicpu_phase_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
         }
     }
     wmb();
diff --git a/src/a2a3/platform/src/host/l2_perf_collector.cpp b/src/a2a3/platform/src/host/l2_swimlane_collector.cpp
similarity index 82%
rename from src/a2a3/platform/src/host/l2_perf_collector.cpp
rename to src/a2a3/platform/src/host/l2_swimlane_collector.cpp
index 98c72a928..c06d083e1 100644
--- a/src/a2a3/platform/src/host/l2_perf_collector.cpp
+++ b/src/a2a3/platform/src/host/l2_swimlane_collector.cpp
@@ -10,15 +10,15 @@
  */
 
 /**
- * @file l2_perf_collector.cpp
+ * @file l2_swimlane_collector.cpp
  * @brief Performance data collector implementation. The mgmt-thread + buffer-pool
  *        machinery lives in profiling_common::BufferPoolManager parameterized by
- *        L2PerfModule (host/l2_perf_collector.h); the poll loop lives in
+ *        L2SwimlaneModule (host/l2_swimlane_collector.h); the poll loop lives in
  *        profiling_common::ProfilerBase. This file owns the per-buffer
  *        on_buffer_collected callback and the export logic.
  */
 
-#include "host/l2_perf_collector.h"
+#include "host/l2_swimlane_collector.h"
 
 #include <algorithm>
 #include <chrono>
@@ -36,7 +36,7 @@
 #include "common/unified_log.h"
 
 // =============================================================================
-// L2PerfCollector Implementation
+// L2SwimlaneCollector Implementation
 // =============================================================================
 
 /**
@@ -57,16 +57,16 @@
  * because idle is reconstructed from record gaps.
  */
 static constexpr uint32_t kAicpuOrchPhaseIdBase = 16;
-static bool is_scheduler_phase(AicpuPhaseId id) { return static_cast<uint32_t>(id) < kAicpuOrchPhaseIdBase; }
+static bool is_scheduler_phase(L2SwimlaneAicpuPhaseId id) { return static_cast<uint32_t>(id) < kAicpuOrchPhaseIdBase; }
 
-L2PerfCollector::~L2PerfCollector() {
+L2SwimlaneCollector::~L2SwimlaneCollector() {
     stop();
     if (shm_host_ != nullptr) {
-        LOG_WARN("L2PerfCollector destroyed without finalize()");
+        LOG_WARN("L2SwimlaneCollector destroyed without finalize()");
     }
 }
 
-void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) {
+void *L2SwimlaneCollector::alloc_single_buffer(size_t size, void **host_ptr_out) {
     void *dev_ptr = alloc_cb_(size);
     if (dev_ptr == nullptr) {
         LOG_ERROR("Failed to allocate buffer (%zu bytes)", size);
@@ -92,12 +92,12 @@ void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) {
     return dev_ptr;
 }
 
-int L2PerfCollector::initialize(
-    int num_aicore, int device_id, L2PerfLevel l2_perf_level, const L2PerfAllocCallback &alloc_cb,
-    L2PerfRegisterCallback register_cb, const L2PerfFreeCallback &free_cb, const std::string &output_prefix
+int L2SwimlaneCollector::initialize(
+    int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb,
+    L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
 ) {
     if (shm_host_ != nullptr) {
-        LOG_ERROR("L2PerfCollector already initialized");
+        LOG_ERROR("L2SwimlaneCollector already initialized");
         return -1;
     }
 
@@ -109,7 +109,7 @@ int L2PerfCollector::initialize(
     }
 
     num_aicore_ = num_aicore;
-    l2_perf_level_ = l2_perf_level;
+    l2_swimlane_level_ = l2_swimlane_level;
     output_prefix_ = output_prefix;
     total_perf_collected_ = 0;
     total_phase_collected_ = 0;
@@ -126,9 +126,9 @@ int L2PerfCollector::initialize(
 
     LOG_DEBUG("Shared memory allocation plan:");
     LOG_DEBUG("  Number of cores:      %d", num_aicore);
-    LOG_DEBUG("  Header size:          %zu bytes", sizeof(L2PerfDataHeader));
-    LOG_DEBUG("  L2PerfBufferState size: %zu bytes each", sizeof(L2PerfBufferState));
-    LOG_DEBUG("  PhaseBufferState size:%zu bytes each", sizeof(PhaseBufferState));
+    LOG_DEBUG("  Header size:          %zu bytes", sizeof(L2SwimlaneDataHeader));
+    LOG_DEBUG("  L2SwimlaneAicpuTaskPool size: %zu bytes each", sizeof(L2SwimlaneAicpuTaskPool));
+    LOG_DEBUG("  L2SwimlaneAicpuPhasePool size:%zu bytes each", sizeof(L2SwimlaneAicpuPhasePool));
     LOG_DEBUG("  Total shared memory:  %zu bytes (%zu KB)", total_size, total_size / 1024);
 
     // Step 2: Allocate shared memory for slot arrays
@@ -158,7 +158,7 @@ int L2PerfCollector::initialize(
     }
 
     // Step 4: Initialize header
-    L2PerfDataHeader *header = get_l2_perf_header(perf_host_ptr);
+    L2SwimlaneDataHeader *header = get_l2_swimlane_header(perf_host_ptr);
 
     for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) {
         memset(header->queues[t], 0, sizeof(header->queues[t]));
@@ -167,18 +167,18 @@ int L2PerfCollector::initialize(
     }
 
     header->num_cores = num_aicore;
-    header->l2_perf_level = static_cast<uint32_t>(l2_perf_level_);
+    header->l2_swimlane_level = static_cast<uint32_t>(l2_swimlane_level_);
 
-    LOG_DEBUG("Initialized L2PerfDataHeader:");
+    LOG_DEBUG("Initialized L2SwimlaneDataHeader:");
     LOG_DEBUG("  num_cores:              %d", header->num_cores);
-    LOG_DEBUG("  l2_perf_level: %u", header->l2_perf_level);
+    LOG_DEBUG("  l2_swimlane_level: %u", header->l2_swimlane_level);
     LOG_DEBUG("  buffer_capacity:        %d", PLATFORM_PROF_BUFFER_SIZE);
     LOG_DEBUG("  queue capacity:         %d", PLATFORM_PROF_READYQUEUE_SIZE);
 
-    // Step 5: Initialize L2PerfBufferStates — 1 buffer per core in free_queue, rest to recycled pool
+    // Step 5: Initialize L2SwimlaneAicpuTaskPools — 1 buffer per core in free_queue, rest to recycled pool
     for (int i = 0; i < num_aicore; i++) {
-        L2PerfBufferState *state = get_perf_buffer_state(perf_host_ptr, i);
-        memset(state, 0, sizeof(L2PerfBufferState));
+        L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(perf_host_ptr, i);
+        memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool));
 
         state->free_queue.head = 0;
         state->free_queue.tail = 0;
@@ -187,19 +187,19 @@ int L2PerfCollector::initialize(
 
         for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_CORE; s++) {
             void *host_buf_ptr = nullptr;
-            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2PerfBuffer), &host_buf_ptr);
+            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuTaskBuffer), &host_buf_ptr);
             if (dev_buf_ptr == nullptr) {
-                LOG_ERROR("Failed to allocate L2PerfBuffer for core %d, buffer %d", i, s);
+                LOG_ERROR("Failed to allocate L2SwimlaneAicpuTaskBuffer for core %d, buffer %d", i, s);
                 return -1;
             }
-            L2PerfBuffer *buf = reinterpret_cast<L2PerfBuffer *>(host_buf_ptr);
-            memset(buf, 0, sizeof(L2PerfBuffer));
+            L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(host_buf_ptr);
+            memset(buf, 0, sizeof(L2SwimlaneAicpuTaskBuffer));
             buf->count = 0;
 
             if (s == 0) {
                 state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
             } else {
-                manager_.push_recycled(static_cast<int>(ProfBufferType::PERF_RECORD), dev_buf_ptr);
+                manager_.push_recycled(static_cast<int>(ProfBufferType::AICPU_TASK), dev_buf_ptr);
             }
         }
         wmb();
@@ -207,27 +207,27 @@ int L2PerfCollector::initialize(
         wmb();
     }
 
-    // Step 5b: Initialize L2PerfAicoreBufferStates — per-core AICore rotation
+    // Step 5b: Initialize L2SwimlaneAicoreTaskPools — per-core AICore rotation
     // channel + buffer pool. Same SPSC pattern as the AICPU pool above.
     for (int i = 0; i < num_aicore; i++) {
-        L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i);
-        memset(ac_state, 0, sizeof(L2PerfAicoreBufferState));
+        L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i);
+        memset(ac_state, 0, sizeof(L2SwimlaneAicoreTaskPool));
 
         for (int s = 0; s < PLATFORM_AICORE_BUFFERS_PER_CORE; s++) {
             void *host_buf_ptr = nullptr;
-            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2PerfAicoreBuffer), &host_buf_ptr);
+            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicoreTaskBuffer), &host_buf_ptr);
             if (dev_buf_ptr == nullptr) {
-                LOG_ERROR("Failed to allocate L2PerfAicoreBuffer for core %d, buffer %d", i, s);
+                LOG_ERROR("Failed to allocate L2SwimlaneAicoreTaskBuffer for core %d, buffer %d", i, s);
                 return -1;
             }
-            L2PerfAicoreBuffer *buf = reinterpret_cast<L2PerfAicoreBuffer *>(host_buf_ptr);
-            memset(buf, 0, sizeof(L2PerfAicoreBuffer));
+            L2SwimlaneAicoreTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(host_buf_ptr);
+            memset(buf, 0, sizeof(L2SwimlaneAicoreTaskBuffer));
             buf->count = 0;
 
             if (s == 0) {
                 ac_state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
             } else {
-                manager_.push_recycled(static_cast<int>(ProfBufferType::AICORE), dev_buf_ptr);
+                manager_.push_recycled(static_cast<int>(ProfBufferType::AICORE_TASK), dev_buf_ptr);
             }
         }
         wmb();
@@ -235,15 +235,16 @@ int L2PerfCollector::initialize(
         wmb();
     }
     LOG_DEBUG(
-        "Initialized buffer pools: %d L2PerfBuffers/core + %d L2PerfAicoreBuffers/core (1 in free_queue, "
+        "Initialized buffer pools: %d L2SwimlaneAicpuTaskBuffers/core + %d L2SwimlaneAicoreTaskBuffers/core (1 in "
+        "free_queue, "
         "rest in recycled pool)",
         PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE
     );
 
     // Step 5c: Standalone uint64_t[num_aicore] table that will hold per-core
-    // AicoreRotation device addresses. Host only allocates the bytes and
-    // hands the device pointer to AICPU via KernelArgs::aicore_ring_addr;
-    // AICPU itself fills the entries inside `l2_perf_aicpu_init` (it has
+    // L2SwimlaneAicoreRotation device addresses. Host only allocates the bytes and
+    // hands the device pointer to AICPU via KernelArgs::l2_swimlane_aicore_rotation_table;
+    // AICPU itself fills the entries inside `l2_swimlane_aicpu_init` (it has
     // direct access to `&ac_state->rotation` device addresses, no
     // host-to-device translation needed). AICore reads
     // rotation_table[block_idx] at kernel entry.
@@ -252,7 +253,7 @@ int L2PerfCollector::initialize(
         void *rotation_table_host = nullptr;
         void *rotation_table_dev = alloc_single_buffer(table_bytes, &rotation_table_host);
         if (rotation_table_dev == nullptr) {
-            LOG_ERROR("Failed to allocate aicore_ring_addr (rotation) table (%zu bytes)", table_bytes);
+            LOG_ERROR("Failed to allocate l2_swimlane_aicore_rotation_table (rotation) table (%zu bytes)", table_bytes);
             return -1;
         }
         aicore_ring_addr_table_dev_ = rotation_table_dev;
@@ -260,8 +261,8 @@ int L2PerfCollector::initialize(
 
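-    // Step 6: Initialize PhaseBufferStates — 1 buffer per thread in free_queue, rest to recycled pool
+    // Step 6: Initialize L2SwimlaneAicpuPhasePools — 1 buffer per thread in free_queue, rest to recycled pool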
     for (int t = 0; t < num_phase_threads; t++) {
-        PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t);
-        memset(state, 0, sizeof(PhaseBufferState));
+        L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t);
+        memset(state, 0, sizeof(L2SwimlaneAicpuPhasePool));
 
         state->free_queue.head = 0;
         state->free_queue.tail = 0;
@@ -270,19 +271,19 @@ int L2PerfCollector::initialize(
 
         for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) {
             void *host_buf_ptr = nullptr;
-            void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr);
+            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuPhaseBuffer), &host_buf_ptr);
             if (dev_buf_ptr == nullptr) {
-                LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s);
+                LOG_ERROR("Failed to allocate L2SwimlaneAicpuPhaseBuffer for thread %d, buffer %d", t, s);
                 return -1;
             }
-            PhaseBuffer *buf = reinterpret_cast<PhaseBuffer *>(host_buf_ptr);
-            memset(buf, 0, sizeof(PhaseBuffer));
+            L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(host_buf_ptr);
+            memset(buf, 0, sizeof(L2SwimlaneAicpuPhaseBuffer));
             buf->count = 0;
 
             if (s == 0) {
                 state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
             } else {
-                manager_.push_recycled(static_cast<int>(ProfBufferType::PHASE), dev_buf_ptr);
+                manager_.push_recycled(static_cast<int>(ProfBufferType::AICPU_PHASE), dev_buf_ptr);
             }
         }
         wmb();
@@ -297,8 +298,8 @@ int L2PerfCollector::initialize(
     wmb();
 
     // Step 7: Stash device pointer for the caller to publish via
-    // kernel_args.l2_perf_data_base (read back via get_l2_perf_setup_device_ptr()).
-    LOG_DEBUG("L2 perf device base = 0x%lx", reinterpret_cast<uint64_t>(perf_dev_ptr));
+    // kernel_args.l2_swimlane_data_base (read back via get_l2_swimlane_setup_device_ptr()).
+    LOG_DEBUG("L2 swimlane device base = 0x%lx", reinterpret_cast<uint64_t>(perf_dev_ptr));
 
     perf_shared_mem_dev_ = perf_dev_ptr;
     shm_host_ = perf_host_ptr;
@@ -315,8 +316,8 @@ int L2PerfCollector::initialize(
 // ProfilerBase callbacks
 // ---------------------------------------------------------------------------
 
-void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
-    L2PerfBuffer *buf = reinterpret_cast<L2PerfBuffer *>(info.host_buffer_ptr);
+void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
+    L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(info.host_buffer_ptr);
     rmb();
     uint32_t count = buf->count;
     if (count > PLATFORM_PROF_BUFFER_SIZE) {
@@ -331,8 +332,8 @@ void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
     }
 }
 
-void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) {
-    PhaseBuffer *buf = reinterpret_cast<PhaseBuffer *>(info.host_buffer_ptr);
+void L2SwimlaneCollector::copy_phase_buffer(const ReadyBufferInfo &info) {
+    L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(info.host_buffer_ptr);
     rmb();
     uint32_t count = buf->count;
     if (count > static_cast<uint32_t>(PLATFORM_PHASE_RECORDS_PER_THREAD)) {
@@ -369,8 +370,8 @@ void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) {
 //     propagated). The "missing" slot's previous contents are zero because
 //     allocate_single_buffer memsets at allocation.
 //   - Flush-path partial buffer whose tail wasn't reached.
-void L2PerfCollector::copy_aicore_buffer(const ReadyBufferInfo &info) {
-    L2PerfAicoreBuffer *buf = reinterpret_cast<L2PerfAicoreBuffer *>(info.host_buffer_ptr);
+void L2SwimlaneCollector::copy_aicore_buffer(const ReadyBufferInfo &info) {
+    L2SwimlaneAicoreTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(info.host_buffer_ptr);
     rmb();
     uint32_t core_index = info.index;
     if (core_index >= static_cast<uint32_t>(num_aicore_)) {
@@ -384,7 +385,7 @@ void L2PerfCollector::copy_aicore_buffer(const ReadyBufferInfo &info) {
     dst.reserve(dst.size() + count);
     uint32_t skipped = 0;
     for (uint32_t i = 0; i < count; i++) {
-        const L2PerfAicoreRecord &r = buf->records[i];
+        const L2SwimlaneAicoreTaskRecord &r = buf->records[i];
         if (r.start_time == 0) {
             skipped++;
             continue;
@@ -400,10 +401,10 @@ void L2PerfCollector::copy_aicore_buffer(const ReadyBufferInfo &info) {
     }
 }
 
-void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) {
-    if (info.type == ProfBufferType::PERF_RECORD) {
+void L2SwimlaneCollector::on_buffer_collected(const ReadyBufferInfo &info) {
+    if (info.type == ProfBufferType::AICPU_TASK) {
         copy_perf_buffer(info);
-    } else if (info.type == ProfBufferType::PHASE) {
+    } else if (info.type == ProfBufferType::AICPU_PHASE) {
         copy_phase_buffer(info);
     } else {
         copy_aicore_buffer(info);
@@ -419,7 +420,7 @@ void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) {
 // clear current_buf_ptr on the device side. Host's job here is purely
 // accounting + sanity check.
 
-void L2PerfCollector::reconcile_counters() {
+void L2SwimlaneCollector::reconcile_counters() {
     if (shm_host_ == nullptr) {
         return;
     }
@@ -431,9 +432,7 @@ void L2PerfCollector::reconcile_counters() {
     // queue full / flush failure) bump dropped_record_count.
     //   silent_loss = device_total - (collected + dropped)
     // and any non-zero silent loss flags an unaccounted gap on top of the
-    // already-classified dropped losses. `mismatch_record_count` remains in
-    // L2PerfBufferState for ABI continuity but is no longer written — the
-    // AICore staging-slot read it guarded was removed.
+    // already-classified dropped losses.
     //
     // Sanity sub-check: after stop(), any active buffer with records must
     // have been flushed by AICPU (success → current_buf_ptr=0; failure →
@@ -445,7 +444,7 @@ void L2PerfCollector::reconcile_counters() {
                              auto read_buf_count, uint64_t collected, bool optional) {
         int leftover_active = 0;
         for (int i = 0; i < unit_count; i++) {
-            L2PerfBufferState *state = get_state(i);
+            L2SwimlaneAicpuTaskPool *state = get_state(i);
             uint64_t buf_ptr = state->current_buf_ptr;
             if (buf_ptr == 0) continue;
             void *host_ptr = manager_.resolve_host_ptr(reinterpret_cast<void *>(buf_ptr));
@@ -453,7 +452,7 @@ void L2PerfCollector::reconcile_counters() {
             uint32_t count = read_buf_count(host_ptr);
             if (count == 0) continue;
             LOG_ERROR(
-                "L2Perf reconcile: %s %d has un-flushed %s buffer (current_buf_ptr=0x%lx, count=%u) "
+                "L2Swimlane reconcile: %s %d has un-flushed %s buffer (current_buf_ptr=0x%lx, count=%u) "
                 "after stop() — device flush failed",
                 unit_name, i, kind, static_cast<unsigned long>(buf_ptr), count
             );
@@ -462,55 +461,44 @@ void L2PerfCollector::reconcile_counters() {
 
         uint64_t total_device = 0;
         uint64_t dropped_device = 0;
-        uint64_t mismatch_device = 0;
         for (int i = 0; i < unit_count; i++) {
-            L2PerfBufferState *state = get_state(i);
+            L2SwimlaneAicpuTaskPool *state = get_state(i);
             total_device += state->total_record_count;
             dropped_device += state->dropped_record_count;
-            mismatch_device += state->mismatch_record_count;
         }
 
         // PHASE counters are populated only by runtimes that actually emit
         // phase records; skip the comparison entirely when nothing happened.
-        if (optional && total_device == 0 && collected == 0 && dropped_device == 0 && mismatch_device == 0) {
+        if (optional && total_device == 0 && collected == 0 && dropped_device == 0) {
             return;
         }
 
         if (dropped_device > 0) {
             LOG_WARN(
-                "L2Perf reconcile: %lu %s records dropped on device side (buffer full / "
+                "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / "
                 "ready_queue full).",
                 static_cast<unsigned long>(dropped_device), kind
             );
         }
-        if (mismatch_device > 0) {
-            LOG_ERROR(
-                "L2Perf reconcile: %lu %s records carry non-zero mismatch_record_count — "
-                "this counter is no longer written post-AICore-as-producer; non-zero "
-                "indicates stale device state or a corrupted L2PerfBufferState",
-                static_cast<unsigned long>(mismatch_device), kind
-            );
-        }
-        uint64_t accounted = collected + dropped_device + mismatch_device;
+        uint64_t accounted = collected + dropped_device;
         if (accounted != total_device) {
             LOG_WARN(
-                "L2Perf reconcile: %s count mismatch (collected=%lu + dropped=%lu + mismatch=%lu != "
+                "L2Swimlane reconcile: %s count mismatch (collected=%lu + dropped=%lu != "
                 "device_total=%lu, silent_loss=%ld)",
                 kind, static_cast<unsigned long>(collected), static_cast<unsigned long>(dropped_device),
-                static_cast<unsigned long>(mismatch_device), static_cast<unsigned long>(total_device),
-                static_cast<long>(total_device) - static_cast<long>(accounted)
+                static_cast<unsigned long>(total_device), static_cast<long>(total_device) - static_cast<long>(accounted)
             );
         } else {
             LOG_INFO_V0(
-                "L2Perf reconcile: %s counts match (collected=%lu, dropped=%lu, mismatch=%lu, device_total=%lu)", kind,
+                "L2Swimlane reconcile: %s counts match (collected=%lu, dropped=%lu, device_total=%lu)", kind,
                 static_cast<unsigned long>(collected), static_cast<unsigned long>(dropped_device),
-                static_cast<unsigned long>(mismatch_device), static_cast<unsigned long>(total_device)
+                static_cast<unsigned long>(total_device)
             );
         }
 
         if (leftover_active > 0) {
             LOG_ERROR(
-                "L2Perf reconcile: %d %s(s) had un-cleared %s current_buf_ptr — see prior errors", leftover_active,
+                "L2Swimlane reconcile: %d %s(s) had un-cleared %s current_buf_ptr — see prior errors", leftover_active,
                 unit_name, kind
             );
         }
@@ -522,7 +510,7 @@ void L2PerfCollector::reconcile_counters() {
             return get_perf_buffer_state(shm_host_, core_index);
         },
         [](void *host_ptr) {
-            return reinterpret_cast<L2PerfBuffer *>(host_ptr)->count;
+            return reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(host_ptr)->count;
         },
         total_perf_collected_, /*optional=*/false
     );
@@ -533,24 +521,25 @@ void L2PerfCollector::reconcile_counters() {
             return get_phase_buffer_state(shm_host_, num_aicore_, thread_index);
         },
         [](void *host_ptr) {
-            return reinterpret_cast<PhaseBuffer *>(host_ptr)->count;
+            return reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(host_ptr)->count;
         },
         total_phase_collected_, /*optional=*/true
     );
 }
 
-void L2PerfCollector::read_phase_header_metadata() {
+void L2SwimlaneCollector::read_phase_header_metadata() {
     if (shm_host_ == nullptr) {
         return;
     }
 
     rmb();
 
-    AicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_);
+    L2SwimlaneAicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_);
 
-    if (phase_header->magic != AICPU_PHASE_MAGIC) {
+    if (phase_header->magic != L2_SWIMLANE_AICPU_PHASE_MAGIC) {
         LOG_INFO_V0(
-            "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic, AICPU_PHASE_MAGIC
+            "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic,
+            L2_SWIMLANE_AICPU_PHASE_MAGIC
         );
         return;
     }
@@ -593,7 +582,7 @@ void L2PerfCollector::read_phase_header_metadata() {
     LOG_INFO_V0("Phase metadata collection complete: has_phase_data=%s", has_phase_data_ ? "yes" : "no");
 }
 
-// AICore-as-producer post-processing: walk each L2PerfRecord we collected
+// AICore-as-producer post-processing: walk each L2SwimlaneAicpuTaskRecord we collected
 // and patch start/end/duration from the per-core stream of AICore records
 // that arrived through the ready queue. AICore rotation guarantees each
 // per-core stream is a complete prefix of "all dispatched tasks on this
@@ -601,11 +590,11 @@ void L2PerfCollector::read_phase_header_metadata() {
 // free_queue while the session runs, so an arbitrarily long session works).
 //
 // We build a small `reg_task_id → (start, end)` map per core (size on the
-// order of N_tasks_per_core) and patch each L2PerfRecord by its
+// order of N_tasks_per_core) and patch each L2SwimlaneAicpuTaskRecord by its
 // reg_task_id field. Using a map instead of direct indexing tolerates
-// AICPU-side L2PerfBuffer drops (a missing L2PerfRecord doesn't break
+// AICPU-side L2SwimlaneAicpuTaskBuffer drops (a missing L2SwimlaneAicpuTaskRecord doesn't break
 // alignment) and lets the same code work for both runtimes.
-void L2PerfCollector::join_aicore_records() {
+void L2SwimlaneCollector::join_aicore_records() {
     if (shm_host_ == nullptr) {
         return;
     }
@@ -678,7 +667,8 @@ void L2PerfCollector::join_aicore_records() {
         total_unmatched += unmatched;
         if (unmatched > 0) {
             LOG_WARN(
-                "Core %d: %lu L2PerfRecord(s) had no matching AICore entry (AICore buffer drops on rotation? "
+                "Core %d: %lu L2SwimlaneAicpuTaskRecord(s) had no matching AICore entry (AICore buffer drops on "
+                "rotation? "
                 "PLATFORM_AICORE_BUFFERS_PER_CORE=%d may be undersized for host drain rate)",
                 core_idx, static_cast<unsigned long>(unmatched), PLATFORM_AICORE_BUFFERS_PER_CORE
             );
@@ -691,7 +681,7 @@ void L2PerfCollector::join_aicore_records() {
     );
 }
 
-int L2PerfCollector::export_swimlane_json() {
+int L2SwimlaneCollector::export_swimlane_json() {
     // Step 0: Join AICore-emitted start/end/task_id records into the AICPU
     // record stream (AICore-as-producer design).
     join_aicore_records();
@@ -721,7 +711,7 @@ int L2PerfCollector::export_swimlane_json() {
 
     // Step 3: Flatten per-core vectors into tagged records with core_id derived from index
     struct TaggedRecord {
-        const L2PerfRecord *record;
+        const L2SwimlaneAicpuTaskRecord *record;
         uint32_t core_id;
     };
     std::vector<TaggedRecord> tagged_records;
@@ -769,7 +759,7 @@ int L2PerfCollector::export_swimlane_json() {
 
     // Step 5: Compose output path. Filename is fixed (no timestamp) — the
     // caller-provided directory is the per-task uniqueness boundary.
-    std::string filepath = output_prefix_ + "/l2_perf_records.json";
+    std::string filepath = output_prefix_ + "/l2_swimlane_records.json";
 
     // Step 6: Open JSON file for writing
     std::ofstream outfile(filepath);
@@ -782,16 +772,16 @@ int L2PerfCollector::export_swimlane_json() {
     // Fanout fields are emitted as empty/zero — the device-side hot path no
     // longer carries them. Downstream (swimlane_converter.py) joins fanout
     // from the sibling deps.json (dep_gen output).
-    int l2_perf_level = static_cast<int>(l2_perf_level_);
+    int l2_swimlane_level = static_cast<int>(l2_swimlane_level_);
     outfile << "{\n";
-    outfile << "  \"l2_perf_level\": " << l2_perf_level << ",\n";
+    outfile << "  \"l2_swimlane_level\": " << l2_swimlane_level << ",\n";
     outfile << "  \"tasks\": [\n";
 
     // First pass: filter unmatched records (start_time == 0) so we emit a
     // valid JSON without trailing-comma fix-ups. Unmatched records arise when
     // the AICore-side rotation dropped a buffer (free queue empty) and that
     // task's AICore record never made it to the host, leaving the AICPU-side
-    // L2PerfRecord with `start_time == 0`. Subtracting base_time_cycles from
+    // L2SwimlaneAicpuTaskRecord with `start_time == 0`. Subtracting base_time_cycles from
     // 0 would underflow to a huge double timestamp, painting an off-the-chart
     // bar in the swimlane viewer; safer to drop the record. The drop count is
     // already surfaced via `dropped_record_count` and the join warning logged
@@ -845,12 +835,12 @@ int L2PerfCollector::export_swimlane_json() {
     outfile << "  ]";
 
     // Step 8: Write phase profiling data (level >= 3)
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-        auto sched_phase_name = [](AicpuPhaseId id) -> const char * {
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        auto sched_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * {
             switch (id) {
-            case AicpuPhaseId::SCHED_COMPLETE:
+            case L2SwimlaneAicpuPhaseId::SCHED_COMPLETE:
                 return "complete";
-            case AicpuPhaseId::SCHED_DISPATCH:
+            case L2SwimlaneAicpuPhaseId::SCHED_DISPATCH:
                 return "dispatch";
             default:
                 // Legacy SCHED_IDLE_WAIT (3) and SCHED_SCAN (2) land here on
@@ -861,9 +851,9 @@ int L2PerfCollector::export_swimlane_json() {
             }
         };
 
-        auto orch_phase_name = [](AicpuPhaseId id) -> const char * {
+        auto orch_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * {
             switch (id) {
-            case AicpuPhaseId::ORCH_SUBMIT:
+            case L2SwimlaneAicpuPhaseId::ORCH_SUBMIT:
                 return "orch_submit";
             default:
                 // Legacy per-sub-step orch ids 17-24 land here on old captures;
@@ -889,7 +879,7 @@ int L2PerfCollector::export_swimlane_json() {
                 // Phase-specific deltas (currently only SCHED_DISPATCH carries
                 // pop_hit / pop_miss). Other phases pass zero extras; omitting
                 // them keeps the JSON terse per record.
-                if (pr.phase_id == AicpuPhaseId::SCHED_DISPATCH) {
+                if (pr.phase_id == L2SwimlaneAicpuPhaseId::SCHED_DISPATCH) {
                     outfile << ", \"pop_hit\": " << pr.extra1 << ", \"pop_miss\": " << pr.extra2;
                 }
                 outfile << "}";
@@ -903,14 +893,14 @@ int L2PerfCollector::export_swimlane_json() {
         outfile << "  ]";
 
         // Orchestrator timing is no longer emitted as a separate aggregate
-        // block. Per-event AicpuPhaseRecord[] entries (emitted as
+        // block. Per-event L2SwimlaneAicpuPhaseRecord[] entries (emitted as
         // aicpu_orchestrator_phases below) are the single source of truth;
         // the run-window envelope is still visible in the device-side
         // LOG_INFO_V9 "Thread N: orch_start=… orch_end=… orch_cost=…" line.
 
         // Per-task orchestrator phase records (level >= 4, filtered from unified collected_phase_records_)
         bool has_orch_phases = false;
-        if (l2_perf_level_ >= L2PerfLevel::ORCH_PHASES) {
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
             for (const auto &v : collected_phase_records_) {
                 for (const auto &r : v) {
                     if (!is_scheduler_phase(r.phase_id)) {
@@ -969,7 +959,7 @@ int L2PerfCollector::export_swimlane_json() {
     return 0;
 }
 
-int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2PerfFreeCallback &free_cb) {
+int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, const L2SwimlaneFreeCallback &free_cb) {
     if (shm_host_ == nullptr) {
         return 0;
     }
@@ -984,10 +974,10 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe
     // alloc_single_buffer installed via halHostRegister is unregistered
     // before its device memory is freed. Without this the Ascend HAL's
     // per-device registration table accumulates leaked entries across
-    // init_l2_perf() invocations and back-to-back l2_swimlane tests on
+    // init_l2_swimlane() invocations and back-to-back l2_swimlane tests on
     // a reused Worker fail at rc=8 from halHostRegister.
 
-    // Free standalone aicore_ring_addr table
+    // Free the standalone l2_swimlane_aicore_rotation_table
     release_one_buffer(aicore_ring_addr_table_dev_, unregister_cb, free_cb);
     aicore_ring_addr_table_dev_ = nullptr;
 
@@ -998,8 +988,8 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe
 
     // Per-core: current buffer + free_queue slots — these were owned by
     // the AICPU side, not the framework. Same drain pattern for both the
-    // L2PerfBuffer pool and the L2PerfAicoreBuffer pool.
-    auto drain_free_queue = [&](L2PerfFreeQueue &fq) {
+    // L2SwimlaneAicpuTaskBuffer pool and the L2SwimlaneAicoreTaskBuffer pool.
+    auto drain_free_queue = [&](L2SwimlaneFreeQueue &fq) {
         rmb();
         uint32_t head = fq.head;
         uint32_t tail = fq.tail;
@@ -1016,12 +1006,12 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe
     };
 
     for (int i = 0; i < num_aicore_; i++) {
-        L2PerfBufferState *state = get_perf_buffer_state(shm_host_, i);
+        L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm_host_, i);
         release_one_buffer(reinterpret_cast<void *>(state->current_buf_ptr), unregister_cb, free_cb);
         state->current_buf_ptr = 0;
         drain_free_queue(state->free_queue);
 
-        L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(shm_host_, num_aicore_, i);
+        L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(shm_host_, num_aicore_, i);
         release_one_buffer(reinterpret_cast<void *>(ac_state->rotation.current_buf_ptr), unregister_cb, free_cb);
         ac_state->rotation.current_buf_ptr = 0;
         drain_free_queue(ac_state->free_queue);
@@ -1029,7 +1019,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe
 
     int num_phase_threads = PLATFORM_MAX_AICPU_THREADS;
     for (int t = 0; t < num_phase_threads; t++) {
-        PhaseBufferState *state = get_phase_buffer_state(shm_host_, num_aicore_, t);
+        L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm_host_, num_aicore_, t);
 
         release_one_buffer(reinterpret_cast<void *>(state->current_buf_ptr), unregister_cb, free_cb);
         state->current_buf_ptr = 0;
diff --git a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
index c66b11ed9..33b1b6600 100644
--- a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
+++ b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
@@ -11,9 +11,9 @@
 
 #include "aicore/aicore.h"
 #include "aicore/aicore_profiling_state.h"
-#include "aicore/l2_perf_collector_aicore.h"
+#include "aicore/l2_swimlane_collector_aicore.h"
 #include "aicore/pmu_collector_aicore.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"  // Platform configuration (C/C++ compatible)
 #include "runtime.h"
 
@@ -55,15 +55,15 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
     dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT);
 
     uint32_t enable_profiling_flag = get_aicore_profiling_flag();
-    bool l2_perf_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
+    bool l2_swimlane_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
     bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
     bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU);
 
-    // Per-core AicoreRotation channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp.
+    // Per-core L2SwimlaneAicoreRotation channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp.
     // Deferred until first task so AICPU's init has populated the rotation
     // table (the dispatch itself proves init is done).
-    __gm__ AicoreRotation *l2_perf_rotation = nullptr;
-    AicoreLocalState l2_perf_local = {nullptr, 0, 0};
+    __gm__ L2SwimlaneAicoreRotation *l2_swimlane_rotation = nullptr;
+    L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, 0, 0};
 
     volatile uint32_t task_id = AICPU_IDLE_TASK_ID;
     volatile uint32_t last_task_id = AICPU_IDLE_TASK_ID;
@@ -86,8 +86,8 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
             write_reg(RegId::COND, MAKE_ACK_VALUE(actual_task_id));
 
             // First-task lazy resolve of the rotation channel.
-            if (l2_perf_enabled && l2_perf_rotation == nullptr) {
-                l2_perf_rotation = get_aicore_rotation();
+            if (l2_swimlane_enabled && l2_swimlane_rotation == nullptr) {
+                l2_swimlane_rotation = get_l2_swimlane_aicore_rotation();
             }
 
             __gm__ Task *task_ptr = &(runtime->tasks[actual_task_id]);
@@ -107,9 +107,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
                 pipe_barrier(PIPE_ALL);
             }
 
-            if (l2_perf_enabled) {
+            if (l2_swimlane_enabled) {
                 uint64_t end_time = get_sys_cnt_aicore();
-                l2_perf_aicore_record_task(l2_perf_rotation, &l2_perf_local, actual_task_id, start_time, end_time);
+                l2_swimlane_aicore_record_task(
+                    l2_swimlane_rotation, &l2_swimlane_local, actual_task_id, start_time, end_time
+                );
             }
 
             last_task_id = task_id;
diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
index 62b13670b..770ec41dc 100644
--- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
@@ -16,13 +16,13 @@
 
 #include "aicpu/device_log.h"
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 #include "callable.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "runtime.h"
@@ -140,7 +140,7 @@ struct AicpuExecutor {
 
     inline bool try_dispatch_task(
         int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head,
-        int &ready_count, bool l2_perf_enabled, Runtime &runtime
+        int &ready_count, bool l2_swimlane_enabled, Runtime &runtime
     );
 };
 
@@ -239,7 +239,7 @@ inline void AicpuExecutor::resolve_task_dependencies(
 // Try to dispatch a task from thread-local queue to a core
 inline bool AicpuExecutor::try_dispatch_task(
     int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head, int &ready_count,
-    bool l2_perf_enabled, [[maybe_unused]] Runtime &runtime
+    bool l2_swimlane_enabled, [[maybe_unused]] Runtime &runtime
 ) {
     if (ready_count <= 0) {
         return false;
@@ -251,8 +251,8 @@ inline bool AicpuExecutor::try_dispatch_task(
     ready_count--;
 
     // Profiling: record the real AICPU dispatch point for this core. Buffer
-    // rotation is handled inside l2_perf_aicpu_complete_record.
-    if (l2_perf_enabled && get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) {
+    // rotation is handled inside l2_swimlane_aicpu_complete_task.
+    if (l2_swimlane_enabled && get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) {
         dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
     }
 
@@ -329,7 +329,7 @@ int AicpuExecutor::init(Runtime *runtime) {
     }
 
     if (is_l2_swimlane_enabled()) {
-        l2_perf_aicpu_init(runtime->worker_count);
+        l2_swimlane_aicpu_init(runtime->worker_count);
     }
 
     // Perform core discovery: handshake with all cores and collect core type information
@@ -679,8 +679,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
 
     int verification_warning_count = 0;
     const int MAX_VERIFICATION_WARNINGS = 10;
-    bool l2_perf_enabled = is_l2_swimlane_enabled();
-    L2PerfLevel l2_perf_level = get_l2_perf_level();
+    bool l2_swimlane_enabled = is_l2_swimlane_enabled();
+    L2SwimlaneLevel l2_swimlane_level = get_l2_swimlane_level();
     // PMU runs require single-issue dispatch — overlapping in-flight tasks
     // pollute per-task PMU counters. Cached at function scope:
     // is_pmu_enabled() is extern "C" and the compiler cannot hoist it
@@ -707,7 +707,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
     );
 
     // Initialize dispatch timestamps for all cores (only needed at level >= 2)
-    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
         uint64_t dispatch_start_time = get_sys_cnt_aicpu();
         for (int i = 0; i < core_num; i++) {
             int core_id = cur_thread_cores[i];
@@ -744,38 +744,38 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 // Profiling: when prev_running_id exists, its AICore timing was
                 // published to the ring slot first, so complete it BEFORE the
                 // pending task's record to maintain buffer ordering.
-                if (l2_perf_enabled) {
-                    uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                if (l2_swimlane_enabled) {
+                    uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
 
                     if (prev_running_id != AICPU_TASK_INVALID) {
                         Task *prev_task = &runtime.tasks[prev_running_id];
-                        if (l2_perf_aicpu_complete_record(
+                        if (l2_swimlane_aicpu_complete_task(
                                 core_id, thread_idx, static_cast<uint32_t>(prev_running_id),
                                 static_cast<uint64_t>(prev_running_id), prev_task->func_id, h->core_type,
                                 dispatch_timestamps_[core_id], finish_ts
                             ) != 0) {
                             LOG_ERROR(
-                                "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id,
+                                "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id,
                                 prev_running_id
                             );
                         }
-                        if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                        if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                             dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                         }
                     }
 
-                    finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                    finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
                     Task *task = &runtime.tasks[completed_task_id];
-                    if (l2_perf_aicpu_complete_record(
+                    if (l2_swimlane_aicpu_complete_task(
                             core_id, thread_idx, static_cast<uint32_t>(completed_task_id),
                             static_cast<uint64_t>(completed_task_id), task->func_id, h->core_type,
                             dispatch_timestamps_[core_id], finish_ts
                         ) != 0) {
                         LOG_ERROR(
-                            "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id
+                            "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id
                         );
                     }
-                    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                         dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                     }
                 }
@@ -792,12 +792,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
                     dispatched = try_dispatch_task(
                         core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                        cur_aic_ready_count, l2_perf_enabled, runtime
+                        cur_aic_ready_count, l2_swimlane_enabled, runtime
                     );
                 } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
                     dispatched = try_dispatch_task(
                         core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                        cur_aiv_ready_count, l2_perf_enabled, runtime
+                        cur_aiv_ready_count, l2_swimlane_enabled, runtime
                     );
                 }
 
@@ -829,7 +829,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 made_progress = true;
 
                 // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched)
-                if (!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                if (!dispatched && l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                     dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                 }
             } else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) {
@@ -851,20 +851,21 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 // Count it here to avoid losing completion.
                 if (prev_running_id != AICPU_TASK_INVALID) {
                     // Profiling: complete the implicit task's AICore record
-                    if (l2_perf_enabled) {
-                        uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                    if (l2_swimlane_enabled) {
+                        uint64_t finish_ts =
+                            (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
                         Task *prev_task = &runtime.tasks[prev_running_id];
-                        if (l2_perf_aicpu_complete_record(
+                        if (l2_swimlane_aicpu_complete_task(
                                 core_id, thread_idx, static_cast<uint32_t>(prev_running_id),
                                 static_cast<uint64_t>(prev_running_id), prev_task->func_id, h->core_type,
                                 dispatch_timestamps_[core_id], finish_ts
                             ) != 0) {
                             LOG_ERROR(
-                                "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id,
+                                "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id,
                                 prev_running_id
                             );
                         }
-                        if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                        if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                             dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                         }
                     }
@@ -894,19 +895,19 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
 
                 int completed_task_id = running_task_ids_[core_id];
 
-                if (l2_perf_enabled) {
-                    uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                if (l2_swimlane_enabled) {
+                    uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
                     Task *task = &runtime.tasks[completed_task_id];
-                    if (l2_perf_aicpu_complete_record(
+                    if (l2_swimlane_aicpu_complete_task(
                             core_id, thread_idx, static_cast<uint32_t>(completed_task_id),
                             static_cast<uint64_t>(completed_task_id), task->func_id, h->core_type,
                             dispatch_timestamps_[core_id], finish_ts
                         ) != 0) {
                         LOG_ERROR(
-                            "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id
+                            "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id
                         );
                     }
-                    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                         dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                     }
                 }
@@ -921,12 +922,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                     if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
                         dispatched = try_dispatch_task(
                             core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                            cur_aic_ready_count, l2_perf_enabled, runtime
+                            cur_aic_ready_count, l2_swimlane_enabled, runtime
                         );
                     } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
                         dispatched = try_dispatch_task(
                             core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                            cur_aiv_ready_count, l2_perf_enabled, runtime
+                            cur_aiv_ready_count, l2_swimlane_enabled, runtime
                         );
                     }
                 }
@@ -940,7 +941,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 made_progress = true;
 
                 // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched)
-                if (!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                if (!dispatched && l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                     dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                 }
             }
@@ -953,14 +954,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
                     if (try_dispatch_task(
                             core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                            cur_aic_ready_count, l2_perf_enabled, runtime
+                            cur_aic_ready_count, l2_swimlane_enabled, runtime
                         )) {
                         made_progress = true;
                     }
                 } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
                     if (try_dispatch_task(
                             core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                            cur_aiv_ready_count, l2_perf_enabled, runtime
+                            cur_aiv_ready_count, l2_swimlane_enabled, runtime
                         )) {
                         made_progress = true;
                     }
@@ -1099,7 +1100,7 @@ int AicpuExecutor::run(Runtime *runtime) {
 
     // Flush performance buffers for cores managed by this thread
     if (is_l2_swimlane_enabled()) {
-        l2_perf_aicpu_flush_buffers(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]);
+        l2_swimlane_aicpu_flush(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]);
     }
 #if PTO2_PROFILING
     if (is_pmu_enabled()) {
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
index 346c3b9fd..44473ee0d 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
@@ -38,7 +38,7 @@
 #include <vector>
 
 #include "common/core_type.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "pto_runtime2_types.h"
 #include "tensor_info.h"
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
index 94e18b35e..488cb1785 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
@@ -11,9 +11,9 @@
 
 #include "aicore/aicore.h"
 #include "aicore/aicore_profiling_state.h"
-#include "aicore/l2_perf_collector_aicore.h"
+#include "aicore/l2_swimlane_collector_aicore.h"
 #include "aicore/pmu_collector_aicore.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"  // Register-based communication
 #include "pto2_dispatch_payload.h"
 #include "runtime.h"
@@ -57,8 +57,8 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2Di
  * args pointer from it on each dispatch. reg_val is a monotonically
  * increasing task ID used only for dispatch signaling and ACK/FIN protocol.
  *
- * Profiling state (enable flag, L2 perf ring) is published into the platform
- * via set_aicore_profiling_flag / set_aicore_l2_perf_ring at kernel entry —
+ * Profiling state (enable flag, L2 swimlane rotation channel) is published into the platform
+ * via set_aicore_profiling_flag / set_aicore_l2_swimlane_ring at kernel entry —
  * this routine reads it through the matching getters, so neither Handshake
  * nor this signature carry profiling fields.
  *
@@ -98,19 +98,19 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
     __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task);
 
     uint32_t enable_profiling_flag = get_aicore_profiling_flag();
-    bool l2_perf_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
+    bool l2_swimlane_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
     bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
     bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU);
 
-    // Per-core AicoreRotation channel. The pointer to THIS core's rotation
-    // is stored in `KernelArgs::aicore_ring_addr[block_idx]`, but AICPU
-    // populates that table inside `l2_perf_aicpu_init` which runs
+    // Per-core L2SwimlaneAicoreRotation channel. The pointer to THIS core's rotation
+    // is stored in `KernelArgs::l2_swimlane_aicore_rotation_table[block_idx]`, but AICPU
+    // populates that table inside `l2_swimlane_aicpu_init` which runs
     // concurrently with this kernel's entry — so we cannot deref at startup.
-    // Defer the deref via `get_aicore_rotation()` until the first task is
+    // Defer the deref via `get_l2_swimlane_aicore_rotation()` until the first task is
     // dispatched; by then AICPU's init has completed (the very dispatch is
     // proof of that).
-    __gm__ AicoreRotation *l2_perf_rotation = nullptr;
-    AicoreLocalState l2_perf_local = {nullptr, 0, 0};
+    __gm__ L2SwimlaneAicoreRotation *l2_swimlane_rotation = nullptr;
+    L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, 0, 0};
 
     // Phase 4: Main execution loop - poll register for tasks until exit signal
     // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit
@@ -135,10 +135,10 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
             uint32_t task_id = reg_val;  // Decode: register holds task_id directly
 
             // First-task lazy resolve of the rotation channel — see comment
-            // above. `get_aicore_rotation()` caches after first call so this
+            // above. `get_l2_swimlane_aicore_rotation()` caches after first call so this
             // costs nothing on subsequent tasks.
-            if (l2_perf_enabled && l2_perf_rotation == nullptr) {
-                l2_perf_rotation = get_aicore_rotation();
+            if (l2_swimlane_enabled && l2_swimlane_rotation == nullptr) {
+                l2_swimlane_rotation = get_l2_swimlane_aicore_rotation();
             }
 
             // Select dual-buffer slot: same bit as AICPU used when writing payload
@@ -169,9 +169,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
             }
 
             // Performance profiling: record task execution
-            if (l2_perf_enabled) {
+            if (l2_swimlane_enabled) {
                 uint64_t end_time = get_sys_cnt_aicore();
-                l2_perf_aicore_record_task(l2_perf_rotation, &l2_perf_local, task_id, start_time, end_time);
+                l2_swimlane_aicore_record_task(l2_swimlane_rotation, &l2_swimlane_local, task_id, start_time, end_time);
             }
 
             last_reg_val = reg_val;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 7a4966361..a7464b25e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -35,11 +35,11 @@
 #include "pto_shared_memory.h"
 
 // Performance profiling headers
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/scope_stats_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/unified_log.h"
 
 // Register-based communication
@@ -521,7 +521,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // device address nor know the SchedulerContext's core fan-out).
             runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
 #if PTO2_PROFILING
-            rt->orchestrator.l2_perf_level = get_l2_perf_level();
+            rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level();
             {
                 auto &orch = rt->orchestrator;
                 for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
@@ -547,8 +547,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             sched_ctx_.wait_pto2_init_complete();
 
 #if PTO2_PROFILING
-            if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
-                l2_perf_aicpu_set_orch_thread_idx(thread_idx);
+            if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) {
+                l2_swimlane_aicpu_set_orch_thread_idx(thread_idx);
             }
 #endif
 
@@ -663,7 +663,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
             // below carries the same envelope info for debugging, and
             // host-side swimlane derives per-phase timing from the per-event
-            // AicpuPhaseRecord[] stream that already covers everything inside
+            // L2SwimlaneAicpuPhaseRecord[] stream that already covers everything inside
             // submit_task().
             int32_t total_tasks = 0;
             if (rt->orchestrator.sm_header) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index 863299dbc..e670688a6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -166,8 +166,8 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
 ```
 
 Per-thread fanout / fanin edge counts and ready-queue pop hit / miss
-stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records.json`
-captured at l2_perf_level >= 3) and `deps.json`; consume them via
+stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json`
+captured at l2_swimlane_level >= 3) and `deps.json`; consume them via
 `simpler_setup/tools/sched_overhead_analysis.py`.
 
 ---
@@ -241,10 +241,10 @@ mirrors the PMU pattern — two independent channels (one binary, one int):
   (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read
   by AICore (which only needs on/off to decide whether to write timing) and
   by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`.
-- **Granular level (0–4)** — `L2PerfDataHeader::l2_perf_level`
-  (shared memory). Host writes it in `L2PerfCollector::initialize`; AICPU
-  promotes it from the header in `l2_perf_aicpu_init` and exposes it via
-  `get_l2_perf_level()` (typed `L2PerfLevel`) for
+- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level`
+  (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU
+  promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via
+  `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for
   `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
 
 On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled`
@@ -266,7 +266,7 @@ Bare `--enable-l2-swimlane` = level 4 (backward compatible).
 
 ### Level gating in AICPU code
 
-Use the strongly-typed `L2PerfLevel` enum so each gate names the
+Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the
 content it depends on instead of relying on magic numbers:
 
 ```cpp
@@ -275,19 +275,19 @@ content it depends on instead of relying on magic numbers:
 if (is_l2_swimlane_enabled()) { ... }
 
 // AICPU dispatch/finish timestamps.
-// Granular checks below require l2_perf_aicpu_init to have already run
+// Granular checks below require l2_swimlane_aicpu_init to have already run
 // (so the level has been promoted from the shared-memory header).
-if (get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) { ... }
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... }
 
 // Scheduler main-loop phase records (SCHED_*)
-if (get_l2_perf_level() >= L2PerfLevel::SCHED_PHASES) { ... }
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... }
 
 // Orchestrator phase records
-if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { ... }
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... }
 ```
 
-`L2PerfLevel` is defined in `common/l2_perf_profiling.h` with
-underlying type `uint32_t` (matches the `L2PerfDataHeader::l2_perf_level`
+`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with
+underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level`
 shared-memory field and mirrors `PmuEventType : uint32_t`):
 
 | Enumerator | Underlying value |
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h
index daef4dfdd..2ea3d5768 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h
@@ -25,7 +25,7 @@
  * extra I/O and an extra file in the output directory.
  *
  * deps.json is the sole source of truth for fanout: the L2 swimlane hot
- * path no longer records ``L2PerfRecord::fanout[]`` (taking the per-task
+ * path no longer records ``L2SwimlaneAicpuTaskRecord::fanout[]`` (taking the per-task
  * 1 KB GM store off the scheduler critical path). Replay sees every
  * submit and reconstructs the complete dependency graph.
  *
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 6ac98b8db..70a1cacde 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -57,7 +57,7 @@ static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot
 // link these no-op stubs so the runtime translation unit is self-contained.
 // Visibility is hidden so the HOST .so doesn't export them into the global
 // dynamic symbol table where they'd shadow the AICPU .so's strong symbols
-// (same pattern as get_sys_cnt_aicpu / l2_perf_aicpu_record_orch_phase below).
+// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below).
 extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; }
 __attribute__((weak, visibility("hidden"))) void
 dep_gen_aicpu_record_submit(uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *) {}
@@ -73,7 +73,7 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl
 // =============================================================================
 #if PTO2_ORCH_PROFILING
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 // Weak fallback for builds that don't link device_time.cpp (e.g. host).
 // The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
 //
@@ -86,11 +86,11 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl
 // so the AICPU .so's PLT resolves to its own strong definition from
 // device_time.cpp.
 __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-// Weak fallback for builds that don't link l2_perf_collector_aicpu.cpp.
+// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
 // The strong symbol from the AICPU build wins when profiling is available.
 // Also hidden to prevent HOST .so from polluting the global symbol table.
 __attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
+l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
 // Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
 static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
 static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
@@ -116,9 +116,9 @@ uint64_t g_orch_scope_end_atomic_count = 0;
 // in favour of the cumulatives + per-submit envelope; the dispatcher
 // already inserts one record at the end of each submit path via
 // CYCLE_COUNT_ORCH_SUBMIT_RECORD.
-#define CYCLE_COUNT_START()                                                \
-    bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \
-    uint64_t _t0 = get_sys_cnt_aicpu(), _t1;                               \
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = get_sys_cnt_aicpu(), _t1;                                       \
     uint64_t _submit_start_ts = _t0
 #define CYCLE_COUNT_LAP(acc)       \
     do {                           \
@@ -126,37 +126,37 @@ uint64_t g_orch_scope_end_atomic_count = 0;
         acc += (_t1 - _t0);        \
         _t0 = _t1;                 \
     } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                \
-    do {                                                                                   \
-        if (_prof_active) {                                                                \
-            l2_perf_aicpu_record_orch_phase(                                               \
-                AicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \
-            );                                                                             \
-        }                                                                                  \
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                          \
+    do {                                                                                             \
+        if (_prof_active) {                                                                          \
+            l2_swimlane_aicpu_record_orch_phase(                                                     \
+                L2SwimlaneAicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \
+            );                                                                                       \
+        }                                                                                            \
     } while (0)
 #elif PTO2_PROFILING
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
 __attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
+l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
 // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
 static uint32_t g_orch_submit_idx = 0;
-#define CYCLE_COUNT_START()                                                \
-    bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \
-    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;        \
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
+    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;                \
     uint64_t _submit_start_ts = _t0
 #define CYCLE_COUNT_LAP(acc) \
     do {                     \
     } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                \
-    do {                                                                                   \
-        if (_prof_active) {                                                                \
-            _t1 = get_sys_cnt_aicpu();                                                     \
-            l2_perf_aicpu_record_orch_phase(                                               \
-                AicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \
-            );                                                                             \
-        }                                                                                  \
+#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                          \
+    do {                                                                                             \
+        if (_prof_active) {                                                                          \
+            _t1 = get_sys_cnt_aicpu();                                                               \
+            l2_swimlane_aicpu_record_orch_phase(                                                     \
+                L2SwimlaneAicpuPhaseId::ORCH_SUBMIT, _submit_start_ts, _t1, g_orch_submit_idx, (tid) \
+            );                                                                                       \
+        }                                                                                            \
     } while (0)
 #else
 #define CYCLE_COUNT_START()
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 7dd47b19a..ff905c16d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -28,7 +28,7 @@
 #ifndef PTO_ORCHESTRATOR_H
 #define PTO_ORCHESTRATOR_H
 
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "device_arena.h"
 #include "pto_ring_buffer.h"
 #include "pto_runtime2_types.h"
@@ -93,8 +93,8 @@ struct PTO2OrchestratorState {
     int32_t total_cluster_count{0};  // AIC cores = MIX clusters
     int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
 #if PTO2_PROFILING
-    // L2 perf_level copied from get_l2_perf_level().
-    L2PerfLevel l2_perf_level{L2PerfLevel::DISABLED};
+    // L2 swimlane level, copied from get_l2_swimlane_level().
+    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
 #endif
 
     // === GM HEAP (for output buffers) ===
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index a829fecd0..c6bbd0395 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -37,7 +37,7 @@
 #include <vector>
 
 #include "common/core_type.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "pto2_dispatch_payload.h"
 #include "task_args.h"
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index f072d012c..88c8a749b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -15,11 +15,11 @@
 
 #include "common/unified_log.h"
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "pto_runtime2.h"
 #include "pto_shared_memory.h"
@@ -377,30 +377,32 @@ int32_t SchedulerContext::handle_timeout_exit(
 }
 
 #if PTO2_PROFILING
-void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed) {
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
     uint64_t sched_end_ts = get_sys_cnt_aicpu();
     LOG_INFO_V9(
         "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(l2_perf.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
-        cycles_to_us(sched_end_ts - l2_perf.sched_start_ts)
+        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
+        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
     );
 
-    uint64_t sched_total = l2_perf.sched_wiring_cycle + l2_perf.sched_complete_cycle + l2_perf.sched_dispatch_cycle +
-                           l2_perf.sched_idle_cycle;
+    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
+                           l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle;
     if (sched_total == 0) sched_total = 1;
 
 #if PTO2_SCHED_PROFILING
     {
         PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
         uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
-        uint64_t complete_poll = (l2_perf.sched_complete_cycle > otc_total + l2_perf.sched_complete_perf_cycle) ?
-                                     (l2_perf.sched_complete_cycle - otc_total - l2_perf.sched_complete_perf_cycle) :
-                                     0;
-        uint64_t dispatch_poll =
-            (l2_perf.sched_dispatch_cycle > l2_perf.sched_dispatch_pop_cycle + l2_perf.sched_dispatch_setup_cycle) ?
-                (l2_perf.sched_dispatch_cycle - l2_perf.sched_dispatch_pop_cycle - l2_perf.sched_dispatch_setup_cycle) :
+        uint64_t complete_poll =
+            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
+                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
                 0;
+        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
+                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
+                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
+                                      l2_swimlane.sched_dispatch_setup_cycle) :
+                                     0;
 
         LOG_INFO_V9(
             "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
@@ -411,20 +413,21 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
         // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
         // × core_to_thread).
         LOG_INFO_V9(
-            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_complete_cycle),
-            l2_perf.sched_complete_cycle * 100.0 / sched_total
+            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
+            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
         );
 
-        uint64_t c_parent = l2_perf.sched_complete_cycle > 0 ? l2_perf.sched_complete_cycle : 1;
-        uint64_t complete_miss_count = (l2_perf.complete_probe_count > l2_perf.complete_hit_count) ?
-                                           (l2_perf.complete_probe_count - l2_perf.complete_hit_count) :
+        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
+        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
+                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
                                            0;
-        double complete_hit_rate =
-            l2_perf.complete_probe_count > 0 ? l2_perf.complete_hit_count * 100.0 / l2_perf.complete_probe_count : 0.0;
+        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
+                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
+                                       0.0;
         LOG_INFO_V9(
             "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
             thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
-            static_cast<uint64_t>(l2_perf.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
+            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
             complete_hit_rate
         );
         LOG_INFO_V9(
@@ -451,7 +454,8 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
         );
         LOG_INFO_V9(
             "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent
+            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
+            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
         );
 
         // pop_hit / pop_miss per-emit deltas live in each dispatch-phase
@@ -459,65 +463,67 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
         // the run-cumulative tracked in this struct (final-drain emit covers
         // the trailing-idle tail).
         LOG_INFO_V9(
-            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle),
-            l2_perf.sched_dispatch_cycle * 100.0 / sched_total
+            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
+            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
         );
-        uint64_t global_dispatch_count = l2_perf.pop_hit - l2_perf.local_dispatch_count;
-        uint64_t total_dispatched = l2_perf.local_dispatch_count + global_dispatch_count;
-        double local_hit_rate = total_dispatched > 0 ? l2_perf.local_dispatch_count * 100.0 / total_dispatched : 0.0;
+        uint64_t global_dispatch_count = l2_swimlane.pop_hit - l2_swimlane.local_dispatch_count;
+        uint64_t total_dispatched = l2_swimlane.local_dispatch_count + global_dispatch_count;
+        double local_hit_rate =
+            total_dispatched > 0 ? l2_swimlane.local_dispatch_count * 100.0 / total_dispatched : 0.0;
         LOG_INFO_V9(
             "Thread %d:     local_disp   : local=%" PRIu64 ", global=%" PRIu64 ", overflow=%" PRIu64
             ", local_rate=%.1f%%",
-            thread_idx, static_cast<uint64_t>(l2_perf.local_dispatch_count),
-            static_cast<uint64_t>(global_dispatch_count), static_cast<uint64_t>(l2_perf.local_overflow_count),
+            thread_idx, static_cast<uint64_t>(l2_swimlane.local_dispatch_count),
+            static_cast<uint64_t>(global_dispatch_count), static_cast<uint64_t>(l2_swimlane.local_overflow_count),
             local_hit_rate
         );
 
-        uint64_t d_parent = l2_perf.sched_dispatch_cycle > 0 ? l2_perf.sched_dispatch_cycle : 1;
+        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
         LOG_INFO_V9(
             "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
             dispatch_poll * 100.0 / d_parent
         );
         LOG_INFO_V9(
             "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(l2_perf.sched_dispatch_pop_cycle), l2_perf.sched_dispatch_pop_cycle * 100.0 / d_parent,
-            cycles_to_us(l2_perf.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
             static_cast<uint64_t>(sp.pop_atomic_count)
         );
         LOG_INFO_V9(
             "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_perf.sched_dispatch_setup_cycle), l2_perf.sched_dispatch_setup_cycle * 100.0 / d_parent
+            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
+            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
         );
 
 #if PTO2_SCHED_PROFILING
         LOG_INFO_V9(
             "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
-            cycles_to_us(l2_perf.sched_wiring_cycle), l2_perf.sched_wiring_cycle * 100.0 / sched_total,
-            l2_perf.phase_wiring_count
+            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
+            l2_swimlane.phase_wiring_count
         );
 #else
         LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_wiring_cycle),
-            l2_perf.sched_wiring_cycle * 100.0 / sched_total
+            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
+            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
         );
 #endif
 
         LOG_INFO_V9(
-            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_idle_cycle),
-            l2_perf.sched_idle_cycle * 100.0 / sched_total
+            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
+            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
         );
 
         if (cur_thread_completed > 0) {
             LOG_INFO_V9(
                 "Thread %d:   avg/complete   : %.3fus", thread_idx,
-                cycles_to_us(l2_perf.sched_complete_cycle) / cur_thread_completed
+                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
             );
         }
     }
 #endif
     LOG_INFO_V9(
         "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
-        cycles_to_us(sched_total), static_cast<uint64_t>(l2_perf.sched_loop_count), cur_thread_completed
+        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
     );
 }
 #endif
@@ -832,15 +838,15 @@ int32_t SchedulerContext::init(
     regs_ = regs_base;
 
 #if PTO2_PROFILING
-    // l2_perf_aicpu_init promotes g_l2_perf_level from the shared-memory
+    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
     // header — must be called BEFORE caching the level, otherwise the cached
     // value would still be 0 (only the binary enable bit has been seeded by
     // kernel.cpp at this point).
     if (is_l2_swimlane_enabled()) {
-        l2_perf_aicpu_init(runtime->worker_count);
-        l2_perf_level_ = get_l2_perf_level();
-        if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-            l2_perf_aicpu_init_phase(runtime->worker_count, sched_thread_num_);
+        l2_swimlane_aicpu_init(runtime->worker_count);
+        l2_swimlane_level_ = get_l2_swimlane_level();
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_thread_num_);
         }
     }
 #endif
@@ -965,9 +971,9 @@ void SchedulerContext::on_orchestration_done(
     Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
 ) {
 #if PTO2_PROFILING
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
         // Flush orchestrator's phase record buffer
-        l2_perf_aicpu_flush_phase_buffers(thread_idx);
+        l2_swimlane_aicpu_flush_phase_buffers(thread_idx);
     }
 #endif
 
@@ -1020,10 +1026,10 @@ void SchedulerContext::on_orchestration_done(
     // Write core-to-thread mapping AFTER reassignment so the profiling data
     // reflects the final distribution (all active_sched_threads_, including
     // former orchestrator threads when orch_to_sched_ is enabled).
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-        l2_perf_aicpu_init_core_assignments(cores_total_num_);
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
         for (int32_t t = 0; t < active_sched_threads_; t++) {
-            l2_perf_aicpu_write_core_assignments_for_thread(
+            l2_swimlane_aicpu_write_core_assignments_for_thread(
                 t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
             );
         }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
index ea6ab5e01..b18091841 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
@@ -13,7 +13,7 @@
 #include "common/unified_log.h"
 #include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "pto_runtime2.h"
@@ -21,7 +21,7 @@
 #include "spin_hint.h"
 
 // Performance profiling headers
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 
@@ -77,7 +77,7 @@ void SchedulerContext::complete_slot_task(
 #endif
 ) {
 #if PTO2_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #else
     (void)hank;
 #endif
@@ -130,7 +130,7 @@ void SchedulerContext::complete_slot_task(
         sched_->on_mixed_task_complete(slot_state, local_bufs);
 #endif
 #if PTO2_PROFILING
-        l2_perf.phase_complete_count++;
+        l2_swimlane.phase_complete_count++;
 #endif
         if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
             deferred_release_slot_states[deferred_release_count++] = &slot_state;
@@ -151,24 +151,24 @@ void SchedulerContext::complete_slot_task(
     }
 
 #if PTO2_PROFILING
-    if (l2_perf.l2_perf_enabled) {
+    if (l2_swimlane.l2_swimlane_enabled) {
 #if PTO2_SCHED_PROFILING
         uint64_t t_perf_start = get_sys_cnt_aicpu();
 #endif
-        uint64_t finish_ts = (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+        uint64_t finish_ts = (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
 
         int32_t perf_slot_idx = static_cast<int32_t>(subslot);
-        if (l2_perf_aicpu_complete_record(
+        if (l2_swimlane_aicpu_complete_task(
                 core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), slot_state.task->task_id.raw,
                 slot_state.task->kernel_id[perf_slot_idx], hank[core_id].core_type, dispatch_ts, finish_ts
             ) != 0) {
             LOG_ERROR(
-                "Core %d: l2_perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id,
+                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
                 static_cast<uint64_t>(slot_state.task->task_id.raw)
             );
         }
 #if PTO2_SCHED_PROFILING
-        l2_perf.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
+        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
 #endif
     }
 
@@ -205,7 +205,7 @@ void SchedulerContext::check_running_cores_for_completion(
     PTO2LocalReadyBuffer *local_bufs
 ) {
 #if PTO2_SCHED_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #endif
     CoreTracker &tracker = core_trackers_[thread_idx];
     auto running_core_states = tracker.get_all_running_cores();
@@ -227,8 +227,8 @@ void SchedulerContext::check_running_cores_for_completion(
         int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
 
 #if PTO2_SCHED_PROFILING
-        if (l2_perf.l2_perf_enabled) {
-            l2_perf.complete_probe_count++;
+        if (l2_swimlane.l2_swimlane_enabled) {
+            l2_swimlane.complete_probe_count++;
         }
 #endif
 
@@ -237,8 +237,8 @@ void SchedulerContext::check_running_cores_for_completion(
         if (!t.matched) continue;
 
 #if PTO2_SCHED_PROFILING
-        if (l2_perf.l2_perf_enabled && (t.running_done || t.pending_done)) {
-            l2_perf.complete_hit_count++;
+        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
+            l2_swimlane.complete_hit_count++;
         }
 #endif
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 7ed8c6bb0..7886fb6bc 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -11,7 +11,7 @@
 #ifndef SCHEDULER_CONTEXT_H
 #define SCHEDULER_CONTEXT_H
 
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/unified_log.h"
 #include "scheduler_types.h"
 
@@ -135,10 +135,10 @@ class SchedulerContext {
     SyncStartDrainState drain_state_;
 
 #if PTO2_PROFILING
-    SchedL2PerfCounters sched_l2_perf_[MAX_AICPU_THREADS];
-    // Cached once at init() from get_l2_perf_level(), AFTER
-    // l2_perf_aicpu_init has promoted the level from the shared-memory header.
-    L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};
+    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
+    // Cached once at init() from get_l2_swimlane_level(), AFTER
+    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
 #endif
 
     // --- Task-execution tracking ---
@@ -349,7 +349,7 @@ class SchedulerContext {
     );
 
 #if PTO2_PROFILING
-    __attribute__((noinline, cold)) void log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed);
+    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
 #endif
 
     // =========================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 0239ea218..3d5d95540 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -19,7 +19,7 @@
 #include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
 #include "callable.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "pto_runtime2.h"
@@ -27,7 +27,7 @@
 #include "spin_hint.h"
 
 // Performance profiling headers
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 
@@ -79,15 +79,15 @@ int SchedulerContext::pop_ready_tasks_batch(
     PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
 ) {
 #if PTO2_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #if PTO2_SCHED_PROFILING
     extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
     uint64_t t_pop_start = get_sys_cnt_aicpu();
     int count = sched_->get_ready_tasks_batch(
         shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx],
-        l2_perf.local_dispatch_count
+        l2_swimlane.local_dispatch_count
     );
-    l2_perf.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
+    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
 #else
     int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
 #endif
@@ -95,9 +95,9 @@ int SchedulerContext::pop_ready_tasks_batch(
     // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health
     // stats on default builds.
     if (count > 0) {
-        l2_perf.pop_hit += count;
+        l2_swimlane.pop_hit += count;
     } else {
-        l2_perf.pop_miss++;
+        l2_swimlane.pop_miss++;
     }
 #else
     (void)thread_idx;
@@ -159,7 +159,7 @@ void SchedulerContext::dispatch_subtask_to_core(
         core_exec_state.pending_slot_state = &slot_state;
         core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
 #if PTO2_PROFILING
-        if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) {
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
             core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu();
         }
 #endif
@@ -168,7 +168,7 @@ void SchedulerContext::dispatch_subtask_to_core(
         core_exec_state.running_slot_state = &slot_state;
         core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
 #if PTO2_PROFILING
-        if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) {
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
             core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu();
         }
 #endif
@@ -246,7 +246,7 @@ void SchedulerContext::dispatch_block(
         dispatch_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
     }
 #if PTO2_PROFILING
-    sched_l2_perf_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask());
+    sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask());
 #endif
 }
 
@@ -255,7 +255,7 @@ void SchedulerContext::dispatch_shape(
     CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
 ) {
 #if PTO2_SCHED_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #endif
     if (entered_drain) return;
 
@@ -323,7 +323,7 @@ void SchedulerContext::dispatch_shape(
             }
             made_progress = true;
 #if PTO2_SCHED_PROFILING
-            l2_perf.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
+            l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
 #endif
         }
 
@@ -352,7 +352,7 @@ void SchedulerContext::dispatch_ready_tasks(
     const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
 
 #if PTO2_SCHED_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #endif
 
     // Note: flush_local_bufs is invoked multiple times per pass (mid-function
@@ -366,7 +366,7 @@ void SchedulerContext::dispatch_ready_tasks(
         for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
             auto &lb = local_bufs[s];
 #if PTO2_SCHED_PROFILING
-            l2_perf.local_overflow_count += lb.count;
+            l2_swimlane.local_overflow_count += lb.count;
 #endif
             if (lb.count > 0) {
                 sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
@@ -511,9 +511,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     int32_t idle_iterations = 0;
     int32_t last_progress_count = 0;
 #if PTO2_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
-    l2_perf.reset();
-    l2_perf.l2_perf_enabled = (l2_perf_level_ != L2PerfLevel::DISABLED);
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+    l2_swimlane.reset();
+    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
 #endif
 
     constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
@@ -534,7 +534,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     const bool pmu_active = is_pmu_enabled();
 
 #if PTO2_PROFILING
-    l2_perf.sched_start_ts = get_sys_cnt_aicpu();
+    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
 #endif
 
     while (true) {
@@ -544,7 +544,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
         bool made_progress = false;
 #if PTO2_PROFILING
         CYCLE_COUNT_START();
-        l2_perf.sched_loop_count++;
+        l2_swimlane.sched_loop_count++;
         uint64_t _t0_phase = _t0;
 #endif
         int32_t task_count = 0;
@@ -559,7 +559,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
         }
 
 #if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
+        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
 #endif
 
         // Phase 1: Check running cores for completion
@@ -621,16 +621,16 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
 #if PTO2_PROFILING
         if (!try_completed) {
-            CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
         } else {
-            CYCLE_COUNT_LAP(l2_perf.sched_complete_cycle);
-            if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_complete_count > 0) {
-                l2_perf_aicpu_record_phase(
-                    thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_perf.sched_loop_count,
-                    l2_perf.phase_complete_count
+            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_complete_count > 0) {
+                l2_swimlane_aicpu_record_phase(
+                    thread_idx, L2SwimlaneAicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_complete_count
                 );
                 _t0_phase = _t1;
-                l2_perf.phase_complete_count = 0;
+                l2_swimlane.phase_complete_count = 0;
             }
         }
 #endif
@@ -649,12 +649,12 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             if (wired > 0) {
                 made_progress = true;
 #if PTO2_SCHED_PROFILING
-                l2_perf.phase_wiring_count += wired;
+                l2_swimlane.phase_wiring_count += wired;
 #endif
             }
         }
 #if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_perf.sched_wiring_cycle);
+        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
 #endif
 
         // Phase 3b: Drain dummy ready queue (thread 0 only).
@@ -704,28 +704,28 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
 #if PTO2_PROFILING
         if (!try_pushed) {
-            CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
         } else {
-            CYCLE_COUNT_LAP(l2_perf.sched_dispatch_cycle);
-            if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_dispatch_count > 0) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) {
                 // Per-emit pop deltas via snapshot diff; the cumulative
                 // pop_hit / pop_miss stay intact for the cold-path log.
-                uint64_t pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit;
-                uint64_t pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit;
-                // AicpuPhaseRecord's extras are uint32 — a delta that overflows means
+                uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+                uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+                // L2SwimlaneAicpuPhaseRecord's extras are uint32 — a delta that overflows means
                 // an emit was missed for ~4 billion pops, which is well outside any
                 // realistic dispatch cadence and silently truncates without this guard.
                 debug_assert(pop_hit_delta < (1ULL << 32));
                 debug_assert(pop_miss_delta < (1ULL << 32));
-                l2_perf_aicpu_record_phase(
-                    thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_perf.sched_loop_count,
-                    l2_perf.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
+                l2_swimlane_aicpu_record_phase(
+                    thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
                     static_cast<uint32_t>(pop_miss_delta)
                 );
                 _t0_phase = _t1;
-                l2_perf.phase_dispatch_count = 0;
-                l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit;
-                l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss;
+                l2_swimlane.phase_dispatch_count = 0;
+                l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+                l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
             }
         }
 #endif
@@ -760,21 +760,21 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
                     thread_idx, header, runtime, idle_iterations, last_progress_count
 #if PTO2_PROFILING
                     ,
-                    l2_perf.sched_start_ts
+                    l2_swimlane.sched_start_ts
 #endif
                 );
             } else {
                 SPIN_WAIT_HINT();
             }
 #if PTO2_PROFILING
-            CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
             // Idle iterations no longer emit a phase record. Host tooling
             // recovers idle spans from the gap between consecutive sched
             // phase records on the same thread. _t0_phase still advances
             // so the next emitted COMPLETE/DISPATCH gets the correct
             // start_time (the iter it actually ran in), not the start of
             // the preceding idle stretch.
-            if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
                 _t0_phase = _t1;
             }
 #endif
@@ -801,31 +801,31 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     // sum(record.pop_*) reconciles with the run-cumulative counter.
     // Gate on SCHED_PHASES — at lower levels the phase buffer is never
     // flushed (see below), so writing this record would be wasted work.
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-        uint64_t final_pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit;
-        uint64_t final_pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit;
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
         debug_assert(final_pop_hit_delta < (1ULL << 32));
         debug_assert(final_pop_miss_delta < (1ULL << 32));
         if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
             uint64_t t_now = get_sys_cnt_aicpu();
-            l2_perf_aicpu_record_phase(
-                thread_idx, AicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_perf.sched_loop_count, 0,
+            l2_swimlane_aicpu_record_phase(
+                thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_swimlane.sched_loop_count, 0,
                 static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta)
             );
-            l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit;
-            l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss;
+            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
         }
     }
-    log_l2_perf_summary(thread_idx, cur_thread_completed);
+    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
 #endif
 
 #if PTO2_PROFILING
-    if (l2_perf.l2_perf_enabled) {
-        l2_perf_aicpu_flush_buffers(
+    if (l2_swimlane.l2_swimlane_enabled) {
+        l2_swimlane_aicpu_flush(
             thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
         );
-        if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-            l2_perf_aicpu_flush_phase_buffers(thread_idx);
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            l2_swimlane_aicpu_flush_phase_buffers(thread_idx);
         }
     }
 #endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
index a73e1b0b9..00ceef76d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -341,8 +341,8 @@ struct SlotTransition {
 // =============================================================================
 
 #if PTO2_PROFILING
-struct alignas(64) SchedL2PerfCounters {
-    bool l2_perf_enabled{false};
+struct alignas(64) SchedL2SwimlaneCounters {
+    bool l2_swimlane_enabled{false};
     uint64_t sched_start_ts{0};
     uint64_t sched_complete_cycle{0};
     uint64_t sched_dispatch_cycle{0};
@@ -369,7 +369,7 @@ struct alignas(64) SchedL2PerfCounters {
     uint64_t sched_dispatch_pop_cycle{0};
     uint64_t sched_dispatch_setup_cycle{0};
 #endif
-    void reset() { *this = SchedL2PerfCounters{}; }
+    void reset() { *this = SchedL2SwimlaneCounters{}; }
 };
 #endif
 
diff --git a/src/a5/platform/include/aicore/aicore_profiling_state.h b/src/a5/platform/include/aicore/aicore_profiling_state.h
index 53acce990..70c062a88 100644
--- a/src/a5/platform/include/aicore/aicore_profiling_state.h
+++ b/src/a5/platform/include/aicore/aicore_profiling_state.h
@@ -25,7 +25,7 @@
  *
  * Lifecycle:
  *   1. Host fills `KernelArgs::enable_profiling_flag`, the two per-core
- *      ring address arrays (`aicore_l2_perf_ring_addrs`,
+ *      ring address arrays (`aicore_l2_swimlane_ring_addrs`,
  *      `aicore_pmu_ring_addrs`), and `regs` (the per-physical-core
  *      register-base array — already required for AICPU).
  *   2. AICore kernel entry indexes the ring arrays by `block_idx` and
@@ -44,7 +44,7 @@
 #include <cstdint>
 
 #include "aicore/aicore.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/pmu_profiling.h"
 
 /**
@@ -56,12 +56,12 @@ __aicore__ void set_aicore_profiling_flag(uint32_t flag);
 __aicore__ uint32_t get_aicore_profiling_flag();
 
 /**
- * Per-core L2Perf staging ring. Set once at kernel entry from
- * `((__gm__ uint64_t*)k_args->aicore_l2_perf_ring_addrs)[block_idx]`;
+ * Per-core L2Swimlane staging ring. Set once at kernel entry from
+ * `((__gm__ uint64_t*)k_args->aicore_l2_swimlane_ring_addrs)[block_idx]`;
  * nullptr when the L2 swimlane bit is off or the address table is null.
  */
-__aicore__ void set_aicore_l2_perf_ring(__gm__ L2PerfAicoreRing *ring);
-__aicore__ __gm__ L2PerfAicoreRing *get_aicore_l2_perf_ring();
+__aicore__ void set_aicore_l2_swimlane_ring(__gm__ L2SwimlaneAicoreRing *ring);
+__aicore__ __gm__ L2SwimlaneAicoreRing *get_aicore_l2_swimlane_ring();
 
 /**
  * Per-core PMU staging ring (a5-only — AICore writes the snapshot).
diff --git a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h b/src/a5/platform/include/aicore/l2_swimlane_collector_aicore.h
similarity index 73%
rename from src/a5/platform/include/aicore/l2_perf_collector_aicore.h
rename to src/a5/platform/include/aicore/l2_swimlane_collector_aicore.h
index b10ebb32f..2bdfceaa5 100644
--- a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h
+++ b/src/a5/platform/include/aicore/l2_swimlane_collector_aicore.h
@@ -9,18 +9,18 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * @file l2_perf_collector_aicore.h
+ * @file l2_swimlane_collector_aicore.h
  * @brief AICore performance data collection interface
  *
  * Provides lightweight performance recording interface for AICore kernels.
  * Uses dcci for efficient cache management instead of memory barriers.
  */
 
-#ifndef PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_
-#define PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_
+#ifndef PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_
+#define PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_
 
 #include "aicore/aicore.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 
 // Include platform-specific timestamp implementation
@@ -35,29 +35,30 @@
 /**
  * Record task execution performance data
  *
- * Writes timing metrics to the per-core L2PerfAicoreRing slot
+ * Writes timing metrics to the per-core L2SwimlaneAicoreRing slot
  * (`dual_issue_slots[task_id % PLATFORM_L2_AICORE_RING_SIZE]`). The
  * ring is allocated once by the host and never reassigned, so AICore writes
  * to a stable address regardless of AICPU buffer rotations. AICPU reads the
- * slot in `l2_perf_aicpu_complete_record` and commits the record into the
- * rotating L2PerfBuffer.
+ * slot in `l2_swimlane_aicpu_complete_task` and commits the record into the
+ * rotating L2SwimlaneAicpuTaskBuffer.
  *
- * AICore writes L2PerfRecord.task_id as the register dispatch token (low 32 bits, zero-extended).
+ * AICore writes L2SwimlaneAicpuTaskRecord.task_id as the register dispatch token (low 32 bits, zero-extended).
  * For tensormap_and_ringbuffer, AICPU overwrites with the full (ring_id << 32) | local_id
  * encoding after handshake match.
  *
- * @param ring        Per-core L2PerfAicoreRing pointer (from get_aicore_l2_perf_ring())
+ * @param ring        Per-core L2SwimlaneAicoreRing pointer (from get_aicore_l2_swimlane_ring())
  * @param task_id     Register dispatch id (DATA_MAIN_BASE), stored in task_id low 32 bits
  * @param start_time  Start timestamp
  * @param end_time    End timestamp
  */
-__aicore__ __attribute__((always_inline)) static inline void
-l2_perf_aicore_record_task(__gm__ L2PerfAicoreRing *ring, uint32_t task_id, uint64_t start_time, uint64_t end_time) {
+__aicore__ __attribute__((always_inline)) static inline void l2_swimlane_aicore_record_task(
+    __gm__ L2SwimlaneAicoreRing *ring, uint32_t task_id, uint64_t start_time, uint64_t end_time
+) {
     // Modulo-indexed slot. PLATFORM_L2_AICORE_RING_SIZE is conventionally a
     // power of two so the compiler reduces this to a mask, but using `%`
     // keeps the index correct if the ring size is ever retuned to a
     // non-power-of-two value (matches the a2a3 convention).
-    __gm__ L2PerfRecord *record = &ring->dual_issue_slots[task_id % PLATFORM_L2_AICORE_RING_SIZE];
+    __gm__ L2SwimlaneAicpuTaskRecord *record = &ring->dual_issue_slots[task_id % PLATFORM_L2_AICORE_RING_SIZE];
 
     record->start_time = start_time;
     record->end_time = end_time;
@@ -71,4 +72,4 @@ l2_perf_aicore_record_task(__gm__ L2PerfAicoreRing *ring, uint32_t task_id, uint
     dsb((mem_dsb_t)0);
 }
 
-#endif  // PLATFORM_AICORE_L2_PERF_COLLECTOR_AICORE_H_
+#endif  // PLATFORM_AICORE_L2_SWIMLANE_COLLECTOR_AICORE_H_
diff --git a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h
similarity index 67%
rename from src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h
rename to src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h
index fac7a691c..e3680fd17 100644
--- a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h
+++ b/src/a5/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -9,18 +9,18 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 /**
- * @file l2_perf_collector_aicpu.h
+ * @file l2_swimlane_collector_aicpu.h
  * @brief AICPU performance data collection interface
  *
  * Provides performance profiling management interface for AICPU side.
  * Handles buffer initialization, switching, and flushing.
  */
 
-#ifndef PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_
-#define PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_
+#ifndef PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_
+#define PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_
 
 #include "common/core_type.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 
 // Include platform-specific timestamp implementation
 // Build system selects the correct inner_aicpu.h based on platform:
@@ -30,51 +30,51 @@
 // ============= Public Interface =============
 
 /**
- * L2 perf platform setters — called by the host (sim) or the AICPU kernel
- * entry (onboard) before `l2_perf_aicpu_init()` so AICPU code can read perf
+ * L2 swimlane platform setters — called by the host (sim) or the AICPU kernel
+ * entry (onboard) before `l2_swimlane_aicpu_init()` so AICPU code can read perf
  * state without reaching into the generic `Runtime` struct.
  *
  * Two-channel level transport (mirrors the PMU pattern):
  *   - binary on/off — `enable_profiling_flag` bit1 → `set_l2_swimlane_enabled(bool)`
  *     at kernel entry; queried via `is_l2_swimlane_enabled()`.
- *   - granular L2PerfLevel — `L2PerfDataHeader::l2_perf_level` (shared memory);
- *     read in `l2_perf_aicpu_init` and cached, then queried via
- *     `get_l2_perf_level()` for `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
+ *   - granular L2SwimlaneLevel — `L2SwimlaneDataHeader::l2_swimlane_level` (shared memory);
+ *     read in `l2_swimlane_aicpu_init` and cached, then queried via
+ *     `get_l2_swimlane_level()` for `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
  */
-extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base);
-extern "C" uint64_t get_platform_l2_perf_base();
+extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base);
+extern "C" uint64_t get_platform_l2_swimlane_base();
 extern "C" void set_l2_swimlane_enabled(bool enable);
 extern "C" bool is_l2_swimlane_enabled();
 
 // Typed getter for the granular perf_level (promoted from the shared-memory
-// header inside l2_perf_aicpu_init). Gate sites should use this so the
-// comparison RHS is a named L2PerfLevel constant.
-L2PerfLevel get_l2_perf_level();
+// header inside l2_swimlane_aicpu_init). Gate sites should use this so the
+// comparison RHS is a named L2SwimlaneLevel constant.
+L2SwimlaneLevel get_l2_swimlane_level();
 
 /**
  * Initialize performance profiling for `worker_count` cores.
  *
  * Caches per-core BufferState (including stable AICore staging-ring
- * pointers `state.aicore_ring_ptr`) and pops the initial L2PerfBuffer from
+ * pointers `state.aicore_ring_ptr`) and pops the initial L2SwimlaneAicpuTaskBuffer from
  * each free_queue. Reads the perf device-base pointer published via
- * `set_platform_l2_perf_base()`. Does **not** write any Handshake field —
+ * `set_platform_l2_swimlane_base()`. Does **not** write any Handshake field —
  * profiling state lives in `KernelArgs` + AICore platform-owned slots.
  *
  * @param worker_count Number of active AICore workers
  */
-void l2_perf_aicpu_init(int worker_count);
+void l2_swimlane_aicpu_init(int worker_count);
 
 /**
- * Complete a L2PerfRecord with AICPU-side metadata after AICore task completion
+ * Complete an L2SwimlaneAicpuTaskRecord with AICPU-side metadata after AICore task completion
  *
- * Reads from the per-core L2PerfAicoreRing dual-issue slot
+ * Reads from the per-core L2SwimlaneAicoreRing dual-issue slot
  * (`s_perf_aicore_rings[core_id]->dual_issue_slots[reg_task_id & ...]`),
  * validates task_id match, and commits the record into
  * `state->current_buf_ptr->records[count++]`. Callers must pre-extract
  * fanout into a plain uint64_t array (platform layer cannot depend on
  * runtime linked-list types).
  *
- * @param core_id               Core ID owning the destination buffer (resolved via s_perf_buffer_states)
+ * @param core_id               Core ID owning the destination buffer (resolved via s_aicpu_task_pools)
  * @param thread_idx            Owning AICPU thread (used when rotating records buffer)
  * @param expected_reg_task_id  Register dispatch token (low 32 bits) to validate
  * @param task_id               Task identifier to write (PTO2 encoding or plain id)
@@ -89,7 +89,7 @@ void l2_perf_aicpu_init(int worker_count);
  * flush()-clearing current_buf_ptr deterministically halts subsequent commits
  * (they take the dropped path). Same shape as a2a3.
  */
-int l2_perf_aicpu_complete_record(
+int l2_swimlane_aicpu_complete_task(
     int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type,
     uint64_t dispatch_time, uint64_t finish_time, const uint64_t *fanout, int32_t fanout_count
 );
@@ -103,23 +103,23 @@ int l2_perf_aicpu_complete_record(
  * @param cur_thread_cores Array of core IDs managed by this thread
  * @param core_num Number of cores managed by this thread
  */
-void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num);
+void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num);
 
 /**
  * Initialize AICPU phase profiling
  *
- * Sets up AicpuPhaseHeader and clears per-thread phase record buffers.
- * Must be called once from thread 0 after l2_perf_aicpu_init().
+ * Sets up L2SwimlaneAicpuPhaseHeader and clears per-thread phase record buffers.
+ * Must be called once from thread 0 after l2_swimlane_aicpu_init().
  *
  * @param worker_count Number of AICore workers (used to locate phase region)
  * @param num_sched_threads Number of scheduler threads
  */
-void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads);
+void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads);
 
 /**
  * Record a single scheduler phase
  *
- * Appends an AicpuPhaseRecord to the specified thread's buffer.
+ * Appends an L2SwimlaneAicpuPhaseRecord to the specified thread's buffer.
  * When the buffer is full, switches to a new buffer via FreeQueue.
  *
  * @param thread_idx Scheduler thread index
@@ -130,12 +130,12 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads);
  * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or
  *                        full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator
  *                        phases in tensormap_and_ringbuffer)
- * @param extra1, extra2  Phase-specific delta counters (see AicpuPhaseRecord doc).
+ * @param extra1, extra2  Phase-specific delta counters (see L2SwimlaneAicpuPhaseRecord doc).
  *                        SCHED_DISPATCH uses extra1=pop_hit, extra2=pop_miss; other
  *                        phases pass 0.
  */
-void l2_perf_aicpu_record_phase(
-    int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
+void l2_swimlane_aicpu_record_phase(
+    int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
     uint64_t tasks_processed, uint32_t extra1 = 0, uint32_t extra2 = 0
 );
 
@@ -143,16 +143,16 @@ void l2_perf_aicpu_record_phase(
  * Set orchestrator thread index for per-task phase recording
  *
  * Must be called once from the orchestrator thread before any
- * l2_perf_aicpu_record_orch_phase() calls.
+ * l2_swimlane_aicpu_record_orch_phase() calls.
  *
  * @param thread_idx Thread index for the orchestrator (typically num_sched_threads)
  */
-void l2_perf_aicpu_set_orch_thread_idx(int thread_idx);
+void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx);
 
 /**
  * Record a single orchestrator phase
  *
- * Appends an AicpuPhaseRecord for one sub-step of submit_task().
+ * Appends an L2SwimlaneAicpuPhaseRecord for one sub-step of submit_task().
  * Uses the orchestrator's dedicated buffer slot (set via set_orch_thread_idx).
  *
  * @param phase_id Orchestrator phase identifier (ORCH_SYNC..ORCH_SCOPE_END)
@@ -162,28 +162,28 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx);
  * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding:
  * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes.
  */
-void l2_perf_aicpu_record_orch_phase(
-    AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
+void l2_swimlane_aicpu_record_orch_phase(
+    L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
 );
 
 /**
  * Write core-to-thread assignment mapping to shared memory.
  *
- * Callers invoke `l2_perf_aicpu_init_core_assignments(total_cores)` once, then
- * `l2_perf_aicpu_write_core_assignments_for_thread(t, ids, n)` for every
+ * Callers invoke `l2_swimlane_aicpu_init_core_assignments(total_cores)` once, then
+ * `l2_swimlane_aicpu_write_core_assignments_for_thread(t, ids, n)` for every
  * scheduler thread.
  */
-void l2_perf_aicpu_init_core_assignments(int total_cores);
-void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num);
+void l2_swimlane_aicpu_init_core_assignments(int total_cores);
+void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num);
 
 /**
  * Flush remaining phase records for a thread
  *
  * Marks the current WRITING phase buffer as READY and enqueues it
- * for host collection. Called at thread exit (analogous to l2_perf_aicpu_flush_buffers).
+ * for host collection. Called at thread exit (analogous to l2_swimlane_aicpu_flush).
  *
  * @param thread_idx Thread index (scheduler thread or orchestrator)
  */
-void l2_perf_aicpu_flush_phase_buffers(int thread_idx);
+void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx);
 
-#endif  // PLATFORM_AICPU_L2_PERF_COLLECTOR_AICPU_H_
+#endif  // PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_
diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h
index 67d7a5f1e..34d9a08a8 100644
--- a/src/a5/platform/include/common/kernel_args.h
+++ b/src/a5/platform/include/common/kernel_args.h
@@ -69,19 +69,21 @@ struct KernelArgs {
     DeviceArgs *device_args{nullptr};                       // Device arguments (AICPU reads, contains SO info)
     __may_used_by_aicore__ Runtime *runtime_args{nullptr};  // Task runtime in device memory
     uint64_t regs{0};                                       // Per-core register base address array (platform-specific)
-    uint64_t dump_data_base{0};     // Dump shared memory base address; use explicit flags to detect enablement
-    uint64_t l2_perf_data_base{0};  // L2 perf shared memory base address; use explicit flags to detect enablement
-    uint64_t pmu_data_base{0};      // PMU buffer base address (device memory); 0 = PMU disabled
+    uint64_t dump_data_base{0};  // Dump shared memory base address; use explicit flags to detect enablement
+    uint64_t l2_swimlane_data_base{
+        0
+    };  // L2 swimlane shared memory base address; use explicit flags to detect enablement
+    uint64_t pmu_data_base{0};  // PMU buffer base address (device memory); 0 = PMU disabled
     // Profiling per-core address arrays (moved out of Handshake). Each *_addrs
     // field is a device pointer to uint64_t[num_aicore]. AICore KERNEL_ENTRY
     // indexes by block_idx and forwards into per-core platform state.
-    uint64_t aicore_l2_perf_ring_addrs{0};  // L2PerfAicoreRing* per core; 0 when L2 swimlane is off
-    uint64_t aicore_pmu_ring_addrs{0};      // PmuAicoreRing* per core; 0 when PMU is off
-    uint64_t scope_stats_data_base{0};      // ScopeStatsBuffer device pointer; 0 when scope_stats is off.
-                                            // a5 has no halHostRegister — host keeps a separate shadow and
-                                            // refreshes it via rtMemcpy DEVICE_TO_HOST at dump time.
-    uint32_t log_level{1};                  // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
-    uint32_t log_info_v{5};                 // INFO verbosity threshold (0..9); default V5
+    uint64_t aicore_l2_swimlane_ring_addrs{0};  // L2SwimlaneAicoreRing* per core; 0 when L2 swimlane is off
+    uint64_t aicore_pmu_ring_addrs{0};          // PmuAicoreRing* per core; 0 when PMU is off
+    uint64_t scope_stats_data_base{0};          // ScopeStatsBuffer device pointer; 0 when scope_stats is off.
+                                                // a5 has no halHostRegister — host keeps a separate shadow and
+                                                // refreshes it via rtMemcpy DEVICE_TO_HOST at dump time.
+    uint32_t log_level{1};                      // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
+    uint32_t log_info_v{5};                     // INFO verbosity threshold (0..9); default V5
     uint32_t enable_profiling_flag{0};  // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats
     uint32_t _pad{0};                   // Alignment padding
 
diff --git a/src/a5/platform/include/common/l2_perf_profiling.h b/src/a5/platform/include/common/l2_swimlane_profiling.h
similarity index 69%
rename from src/a5/platform/include/common/l2_perf_profiling.h
rename to src/a5/platform/include/common/l2_swimlane_profiling.h
index e4ede5d0f..363d60330 100644
--- a/src/a5/platform/include/common/l2_perf_profiling.h
+++ b/src/a5/platform/include/common/l2_swimlane_profiling.h
@@ -10,49 +10,49 @@
  */
 
 /**
- * @file l2_perf_profiling.h
+ * @file l2_swimlane_profiling.h
  * @brief Performance profiling data structures
  *
  * Architecture: Fixed header + per-core/thread buffer states + optional phase profiling region
  *
  * Memory layout (shared memory between Host and Device):
  * ┌─────────────────────────────────────────────────────────────┐
- * │ L2PerfDataHeader (fixed header)                             │
+ * │ L2SwimlaneDataHeader (fixed header)                         │
  * │  - ReadyQueue (FIFO, capacity=PLATFORM_PROF_READYQUEUE_SIZE)│
  * │  - Metadata (num_cores, flags)                              │
  * ├─────────────────────────────────────────────────────────────┤
- * │ L2PerfBufferState[0] (Core 0)                               │
+ * │ L2SwimlaneAicpuTaskPool[0] (Core 0)                         │
  * │  - free_queue: SPSC queue of available buffer pointers      │
  * │  - current_buf_ptr, current_buf_seq                         │
  * ├─────────────────────────────────────────────────────────────┤
- * │ L2PerfBufferState[1] (Core 1)                               │
+ * │ L2SwimlaneAicpuTaskPool[1] (Core 1)                         │
  * ├─────────────────────────────────────────────────────────────┤
  * │ ...                                                         │
  * ├─────────────────────────────────────────────────────────────┤
- * │ L2PerfBufferState[num_cores-1]                              │
+ * │ L2SwimlaneAicpuTaskPool[num_cores-1]                        │
  * ├─────────────────────────────────────────────────────────────┤
- * │ AicpuPhaseHeader (optional, present when phase profiling)   │
+ * │ L2SwimlaneAicpuPhaseHeader (optional; phase profiling only) │
  * │  - magic, num_sched_threads, records_per_thread             │
  * │  - core_to_thread mapping                                   │
  * ├─────────────────────────────────────────────────────────────┤
- * │ PhaseBufferState[thread0]                                   │
+ * │ L2SwimlaneAicpuPhasePool[thread0]                           │
  * │  - free_queue: SPSC queue of available buffer pointers      │
  * │  - current_buf_ptr, current_buf_seq                         │
  * ├─────────────────────────────────────────────────────────────┤
- * │ PhaseBufferState[thread1]                                   │
+ * │ L2SwimlaneAicpuPhasePool[thread1]                           │
  * ├─────────────────────────────────────────────────────────────┤
  * │ ...                                                         │
  * └─────────────────────────────────────────────────────────────┘
  *
- * Actual L2PerfBuffer / PhaseBuffer are allocated dynamically by Host
+ * Actual L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer are allocated dynamically by Host
  * and pushed into the per-core/thread free_queue.
  *
- * Base size = sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState)
- * With phases = Base + sizeof(AicpuPhaseHeader) + num_threads * sizeof(PhaseBufferState)
+ * Base size = sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool)
+ * With phases = Base + sizeof(L2SwimlaneAicpuPhaseHeader) + num_threads * sizeof(L2SwimlaneAicpuPhasePool)
  */
 
-#ifndef SRC_A5_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_
-#define SRC_A5_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_
+#ifndef SRC_A5_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
+#define SRC_A5_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
 
 #include <cstdint>
 #include <vector>
@@ -60,13 +60,13 @@
 #include "common/core_type.h"
 #include "common/platform_config.h"
 
-// Maximum number of successor tasks per L2PerfRecord (matches Task::fanout)
+// Maximum number of successor tasks per L2SwimlaneAicpuTaskRecord (matches Task::fanout)
 #ifndef RUNTIME_MAX_FANOUT
 #define RUNTIME_MAX_FANOUT 128
 #endif
 
 // =============================================================================
-// L2 perf_level — granularity ladder for the L2 swimlane profiler.
+// L2SwimlaneLevel — granularity ladder for the L2 swimlane profiler.
 //
 // Each level is a strict superset of the previous: higher levels add the data
 // described by their name on top of all lower-level data. Naming describes
@@ -74,12 +74,12 @@
 // naturally — e.g. `if (level >= SCHED_PHASES)` means "this section runs when
 // scheduler phase records are being collected (or any higher tier)".
 //
-// Transported via `L2PerfDataHeader::l2_perf_level` (host → AICPU,
+// Transported via `L2SwimlaneDataHeader::l2_swimlane_level` (host → AICPU,
 // shared memory) and `CallConfig::enable_l2_swimlane` (Python → C). The wire
 // representation stays integer (uint32_t / int32_t) for ABI stability; this
 // enum is the canonical in-code type used for comparisons.
 // =============================================================================
-enum class L2PerfLevel : uint32_t {
+enum class L2SwimlaneLevel : uint32_t {
     DISABLED = 0,       // No collection at all
     AICORE_TIMING = 1,  // AICore per-task start/end timestamps + task record buffer
     AICPU_TIMING = 2,   // + AICPU dispatch/finish timestamps + fanout dependency list
@@ -88,13 +88,13 @@ enum class L2PerfLevel : uint32_t {
 };
 
 // =============================================================================
-// L2PerfRecord - Single Task Execution Record
+// L2SwimlaneAicpuTaskRecord - Single Task Execution Record
 // =============================================================================
 
 /**
  * Single task execution record
  */
-struct L2PerfRecord {
+struct L2SwimlaneAicpuTaskRecord {
     // Timing information (device clock timestamps)
     uint64_t start_time;  // Task start timestamp (get_sys_cnt)
     uint64_t end_time;    // Task end timestamp
@@ -117,30 +117,33 @@ struct L2PerfRecord {
     int32_t fanout_count;                 // Number of successor tasks
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(L2PerfRecord) % 64 == 0, "L2PerfRecord must be 64-byte aligned for optimal cache performance");
+static_assert(
+    sizeof(L2SwimlaneAicpuTaskRecord) % 64 == 0,
+    "L2SwimlaneAicpuTaskRecord must be 64-byte aligned for optimal cache performance"
+);
 
 // =============================================================================
-// L2PerfAicoreRing - Stable AICore→AICPU Staging Ring (per core, never rotated)
+// L2SwimlaneAicoreRing - Stable AICore→AICPU Staging Ring (per core, never rotated)
 // =============================================================================
 
 /**
  * Per-core staging ring written exclusively by AICore.
  *
  * AICore stores each task's timing in `dual_issue_slots[reg_task_id %
- * PLATFORM_L2_AICORE_RING_SIZE]` and never touches any other L2Perf
+ * PLATFORM_L2_AICORE_RING_SIZE]` and never touches any other L2Swimlane
  * memory. The ring is allocated once by the host, addressed through
- * `L2PerfBufferState[block_idx].aicore_ring_ptr` (also published into the
- * `KernelArgs::aicore_l2_perf_ring_addrs` the AICore kernel entry
- * forwards into `set_aicore_l2_perf_ring()`), and lives for the entire run
+ * `L2SwimlaneAicpuTaskPool[block_idx].aicore_ring_ptr` (also published into the
+ * `KernelArgs::aicore_l2_swimlane_ring_addrs` the AICore kernel entry
+ * forwards into `set_aicore_l2_swimlane_ring()`), and lives for the entire run
  * — its address is never reassigned, decoupling AICore writes from the
  * AICPU's records-buffer rotation.
  */
-struct L2PerfAicoreRing {
-    L2PerfRecord dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE];
+struct L2SwimlaneAicoreRing {
+    L2SwimlaneAicpuTaskRecord dual_issue_slots[PLATFORM_L2_AICORE_RING_SIZE];
 } __attribute__((aligned(64)));
 
 // =============================================================================
-// L2PerfBuffer - Fixed-Size Record Buffer (AICPU-only)
+// L2SwimlaneAicpuTaskBuffer - Fixed-Size Record Buffer (AICPU-only)
 // =============================================================================
 
 /**
@@ -151,16 +154,16 @@ struct L2PerfAicoreRing {
  * by AICPU when full.
  *
  * Owned and written exclusively by AICPU: AICore never touches this memory.
- * AICPU reads timing from L2PerfAicoreRing::dual_issue_slots, fills in the
+ * AICPU reads timing from L2SwimlaneAicoreRing::dual_issue_slots, fills in the
  * AICPU-side fields, then commits into records[count++].
  */
-struct L2PerfBuffer {
-    L2PerfRecord records[PLATFORM_PROF_BUFFER_SIZE];  // Committed records (AICPU writes)
-    volatile uint32_t count;                          // Current committed record count
+struct L2SwimlaneAicpuTaskBuffer {
+    L2SwimlaneAicpuTaskRecord records[PLATFORM_PROF_BUFFER_SIZE];  // Committed records (AICPU writes)
+    volatile uint32_t count;                                       // Current committed record count
 } __attribute__((aligned(64)));
 
 // =============================================================================
-// L2PerfFreeQueue - SPSC Lock-Free Queue for Free Buffers
+// L2SwimlaneFreeQueue - SPSC Lock-Free Queue for Free Buffers
 // =============================================================================
 
 /**
@@ -178,17 +181,17 @@ struct L2PerfBuffer {
  * - Device pop: rmb() → read tail → read buffer_ptrs[head % COUNT] → rmb() → write head → wmb()
  * - Host push: write buffer_ptrs[tail % COUNT] → wmb() → write tail → wmb()
  */
-struct L2PerfFreeQueue {
+struct L2SwimlaneFreeQueue {
     volatile uint64_t buffer_ptrs[PLATFORM_PROF_SLOT_COUNT];  // Free buffer addresses
     volatile uint32_t head;                                   // Consumer read position (Device increments)
     volatile uint32_t tail;                                   // Producer write position (Host increments)
     uint32_t pad[22];                                         // Pad to 128 bytes (aligned to cache line)
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes for cache alignment");
+static_assert(sizeof(L2SwimlaneFreeQueue) == 128, "L2SwimlaneFreeQueue must be 128 bytes for cache alignment");
 
 // =============================================================================
-// L2PerfBufferState - Per-Core/Thread Buffer State (Unified for L2PerfRecord and Phase)
+// L2SwimlaneAicpuTaskPool - Per-Core/Thread Buffer State (Unified for L2SwimlaneAicpuTaskRecord and Phase)
 // =============================================================================
 
 /**
@@ -197,9 +200,9 @@ static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes
  * Contains:
  * - free_queue: SPSC queue of available buffer addresses
  * - current_buf_ptr: Currently active buffer being written (0 = no active buffer)
- * - aicore_ring_ptr: Stable per-core L2PerfAicoreRing address (L2PerfRecord
+ * - aicore_ring_ptr: Stable per-core L2SwimlaneAicoreRing address (L2SwimlaneAicpuTaskRecord
  *   profiling only; unused by Phase profiling). Set by host at init, read by
- *   AICPU in `l2_perf_aicpu_complete_record` to read the AICore-published
+ *   AICPU in `l2_swimlane_aicpu_complete_task` to read the AICore-published
  *   timing slots. Never reassigned during the run.
  * - current_buf_seq: Monotonic sequence number for ordering
  * - total_record_count / dropped_record_count / mismatch_record_count:
@@ -209,9 +212,9 @@ static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes
  *   violations (a hard error class, distinct from capacity drops).
  *
  * Used in two contexts:
- * - Per-core L2PerfRecord profiling (current_buf_ptr → L2PerfBuffer,
- *   aicore_ring_ptr → L2PerfAicoreRing)
- * - Per-thread Phase profiling (current_buf_ptr → PhaseBuffer,
+ * - Per-core L2SwimlaneAicpuTaskRecord profiling (current_buf_ptr → L2SwimlaneAicpuTaskBuffer,
+ *   aicore_ring_ptr → L2SwimlaneAicoreRing)
+ * - Per-thread Phase profiling (current_buf_ptr → L2SwimlaneAicpuPhaseBuffer,
  *   aicore_ring_ptr / mismatch_record_count unused)
  *
  * Writers:
@@ -221,10 +224,10 @@ static_assert(sizeof(L2PerfFreeQueue) == 128, "L2PerfFreeQueue must be 128 bytes
  * - current_buf_seq: Device writes (monotonic counter)
  * - aicore_ring_ptr: Host writes once at init, AICPU reads
  */
-struct L2PerfBufferState {
-    L2PerfFreeQueue free_queue;               // SPSC queue of free buffer addresses
+struct L2SwimlaneAicpuTaskPool {
+    L2SwimlaneFreeQueue free_queue;           // SPSC queue of free buffer addresses
     volatile uint64_t current_buf_ptr;        // Current active buffer (0 = none)
-    volatile uint64_t aicore_ring_ptr;        // Stable AICore staging ring (L2Perf only; 0 for Phase)
+    volatile uint64_t aicore_ring_ptr;        // Stable AICore staging ring (task pool only; 0 for Phase)
     volatile uint32_t current_buf_seq;        // Sequence number for ordering
     volatile uint32_t total_record_count;     // Records the AICPU attempted to write to this state
     volatile uint32_t dropped_record_count;   // Records dropped (queue full / overwrite / no buffer)
@@ -232,35 +235,43 @@ struct L2PerfBufferState {
     uint32_t pad[8];                          // Pad to 192 bytes (aligned to cache line)
 } __attribute__((aligned(64)));
 
-static_assert(sizeof(L2PerfBufferState) == 192, "L2PerfBufferState must be 192 bytes for cache alignment");
+static_assert(sizeof(L2SwimlaneAicpuTaskPool) == 192, "L2SwimlaneAicpuTaskPool must be 192 bytes for cache alignment");
 
 // Type alias for semantic clarity in Phase profiling context
-using PhaseBufferState = L2PerfBufferState;  // Per-thread Phase profiling
+using L2SwimlaneAicpuPhasePool = L2SwimlaneAicpuTaskPool;  // Per-thread Phase profiling
 
 // =============================================================================
 // ReadyQueueEntry - Queue Entry for Ready Buffers
 // =============================================================================
 
+/**
+ * Buffer kind for ReadyQueueEntry::kind. uint32_t underlying so the struct
+ * layout matches the prior `is_phase` field byte-for-byte. a5 currently uses
+ * only AicpuTask and AicpuPhase; AicoreTask is reserved for the AICore-as-
+ * producer migration to a5.
+ */
+enum class L2SwimlaneBufferKind : uint32_t {
+    AicpuTask = 0,   // Per-core L2SwimlaneAicpuTaskBuffer, AICPU writes
+    AicpuPhase = 1,  // Per-thread L2SwimlaneAicpuPhaseBuffer, AICPU writes
+    AicoreTask = 2,  // Reserved (mirrors a2a3)
+};
+
 /**
  * Ready queue entry
  *
  * When a buffer on a core/thread is full, AICPU adds this entry to the queue.
  * Host memory manager retrieves entries from the queue.
- *
- * Entry types (distinguished by is_phase flag):
- * - L2PerfRecord entry: core_index = core ID, is_phase = 0
- * - Phase entry:      core_index = thread_idx, is_phase = 1
  */
 struct ReadyQueueEntry {
-    uint32_t core_index;  // Core index (0 ~ num_cores-1), or thread_idx for phase entries
-    uint32_t is_phase;    // 0 = L2PerfRecord, 1 = Phase
-    uint64_t buffer_ptr;  // Device pointer to the full buffer
-    uint32_t buffer_seq;  // Sequence number for ordering
-    uint32_t pad;         // Alignment padding
+    uint32_t core_index;        // Core index (0 ~ num_cores-1), or thread_idx for phase entries
+    L2SwimlaneBufferKind kind;  // Buffer kind discriminator (uint32_t underlying)
+    uint64_t buffer_ptr;        // Device pointer to the full buffer
+    uint32_t buffer_seq;        // Sequence number for ordering
+    uint32_t pad;               // Alignment padding
 } __attribute__((aligned(32)));
 
 // =============================================================================
-// L2PerfDataHeader - Fixed Header
+// L2SwimlaneDataHeader - Fixed Header
 // =============================================================================
 
 /**
@@ -279,7 +290,7 @@ struct ReadyQueueEntry {
  * - Queue empty: head == tail
  * - Queue full: (tail + 1) % capacity == head
  */
-struct L2PerfDataHeader {
+struct L2SwimlaneDataHeader {
     // Per-thread ready queues (FIFO Circular Buffers)
     // Each AICPU thread has its own queue to avoid lock contention
     ReadyQueueEntry queues[PLATFORM_MAX_AICPU_THREADS][PLATFORM_PROF_READYQUEUE_SIZE];
@@ -287,10 +298,10 @@ struct L2PerfDataHeader {
     volatile uint32_t queue_tails[PLATFORM_MAX_AICPU_THREADS];  // Producer write positions (AICPU modifies)
 
     // Metadata (Host initializes, Device read-only)
-    uint32_t num_cores;      // Actual number of cores launched
-    uint32_t l2_perf_level;  // 0=off, 1=AICore timing, 2=+dispatch/fanout,
-                             // 3=+sched phases, 4=+orch phases. Host writes
-                             // at init; AICPU reads in l2_perf_aicpu_init.
+    uint32_t num_cores;          // Actual number of cores launched
+    uint32_t l2_swimlane_level;  // 0=off, 1=AICore timing, 2=+dispatch/fanout,
+                                 // 3=+sched phases, 4=+orch phases. Host writes
+                                 // at init; AICPU reads in l2_swimlane_aicpu_init.
 } __attribute__((aligned(64)));
 
 // =============================================================================
@@ -303,7 +314,7 @@ struct L2PerfDataHeader {
  * Scheduler phases (0-3): four phases in each scheduler loop iteration.
  * Orchestrator phases (16-24): sub-steps within each submit_task() call.
  */
-enum class AicpuPhaseId : uint32_t {
+enum class L2SwimlaneAicpuPhaseId : uint32_t {
     // Scheduler phases (0-3)
     SCHED_COMPLETE = 0,     // Process completed tasks (fanout traversal)
     SCHED_DISPATCH = 1,     // Dispatch ready tasks to idle cores
@@ -333,11 +344,11 @@ enum class AicpuPhaseId : uint32_t {
  *                   extra2 = pop_miss delta since last emit
  *   All other phases: extras are 0 (reserved for future per-phase metrics).
  */
-struct AicpuPhaseRecord {
-    uint64_t start_time;    // Phase start timestamp
-    uint64_t end_time;      // Phase end timestamp
-    uint32_t loop_iter;     // Loop iteration number
-    AicpuPhaseId phase_id;  // Phase type
+struct L2SwimlaneAicpuPhaseRecord {
+    uint64_t start_time;              // Phase start timestamp
+    uint64_t end_time;                // Phase end timestamp
+    uint32_t loop_iter;               // Loop iteration number
+    L2SwimlaneAicpuPhaseId phase_id;  // Phase type
     union {
         uint64_t task_id;          // tensormap_and_ringbuffer: full PTO2 encoding
                                    // (ring_id << 32) | local_id for cross-view correlation.
@@ -346,31 +357,31 @@ struct AicpuPhaseRecord {
     uint32_t extra1;  // Phase-specific delta (e.g. SCHED_DISPATCH = pop_hit)
     uint32_t extra2;  // Phase-specific delta (e.g. SCHED_DISPATCH = pop_miss)
 };
-static_assert(sizeof(AicpuPhaseRecord) == 40, "AicpuPhaseRecord layout drift");
+static_assert(sizeof(L2SwimlaneAicpuPhaseRecord) == 40, "L2SwimlaneAicpuPhaseRecord layout drift");
 
-constexpr uint32_t AICPU_PHASE_MAGIC = 0x41435048;  // "ACPH"
+constexpr uint32_t L2_SWIMLANE_AICPU_PHASE_MAGIC = 0x41435048;  // "ACPH"
 
 /**
- * Fixed-size phase record buffer (analogous to L2PerfBuffer)
+ * Fixed-size phase record buffer (analogous to L2SwimlaneAicpuTaskBuffer)
  *
  * Capacity: PLATFORM_PHASE_RECORDS_PER_THREAD
  * Allocated dynamically by Host, pushed into per-thread free_queue.
  */
-struct PhaseBuffer {
-    AicpuPhaseRecord records[PLATFORM_PHASE_RECORDS_PER_THREAD];
+struct L2SwimlaneAicpuPhaseBuffer {
+    L2SwimlaneAicpuPhaseRecord records[PLATFORM_PHASE_RECORDS_PER_THREAD];
     volatile uint32_t count;
 } __attribute__((aligned(64)));
 
 /**
  * AICPU phase profiling header
  *
- * Located after the L2PerfBufferState array in shared memory.
+ * Located after the L2SwimlaneAicpuTaskPool array in shared memory.
  * Contains metadata and per-thread tracking.
  */
-struct AicpuPhaseHeader {
-    uint32_t magic;                             // Validation magic (AICPU_PHASE_MAGIC)
+struct L2SwimlaneAicpuPhaseHeader {
+    uint32_t magic;                             // Validation magic (L2_SWIMLANE_AICPU_PHASE_MAGIC)
     uint32_t num_sched_threads;                 // Number of scheduler threads
-    uint32_t records_per_thread;                // Max records per PhaseBuffer
+    uint32_t records_per_thread;                // Max records per L2SwimlaneAicpuPhaseBuffer
     uint32_t num_cores;                         // Total number of cores with valid assignments
     int8_t core_to_thread[PLATFORM_MAX_CORES];  // core_id → scheduler thread index (-1 = unassigned)
 } __attribute__((aligned(64)));
@@ -387,41 +398,45 @@ extern "C" {
  * Calculate total memory size for performance data (buffer states only, no buffers)
  *
  * Formula: Total size = Fixed header + Dynamic tail
- *                     = sizeof(L2PerfDataHeader) + num_cores × sizeof(L2PerfBufferState)
+ *                     = sizeof(L2SwimlaneDataHeader) + num_cores × sizeof(L2SwimlaneAicpuTaskPool)
  *
  * @param num_cores Number of cores (block_dim × PLATFORM_CORES_PER_BLOCKDIM)
  * @return Total bytes for header + buffer states
  */
 inline size_t calc_perf_data_size(int num_cores) {
-    return sizeof(L2PerfDataHeader) + num_cores * sizeof(L2PerfBufferState);
+    return sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool);
 }
 
 /**
  * Get header pointer
  *
  * @param base_ptr Shared memory base address (device_ptr or host_ptr)
- * @return L2PerfDataHeader pointer
+ * @return L2SwimlaneDataHeader pointer
  */
-inline L2PerfDataHeader *get_l2_perf_header(void *base_ptr) { return reinterpret_cast<L2PerfDataHeader *>(base_ptr); }
+inline L2SwimlaneDataHeader *get_l2_swimlane_header(void *base_ptr) {
+    return reinterpret_cast<L2SwimlaneDataHeader *>(base_ptr);
+}
 
 /**
- * Get L2PerfBufferState array start address
+ * Get L2SwimlaneAicpuTaskPool array start address
  *
  * @param base_ptr Shared memory base address
- * @return L2PerfBufferState array pointer
+ * @return L2SwimlaneAicpuTaskPool array pointer
  */
-inline L2PerfBufferState *get_perf_buffer_states(void *base_ptr) {
-    return reinterpret_cast<L2PerfBufferState *>(reinterpret_cast<char *>(base_ptr) + sizeof(L2PerfDataHeader));
+inline L2SwimlaneAicpuTaskPool *get_perf_buffer_states(void *base_ptr) {
+    return reinterpret_cast<L2SwimlaneAicpuTaskPool *>(
+        reinterpret_cast<char *>(base_ptr) + sizeof(L2SwimlaneDataHeader)
+    );
 }
 
 /**
- * Get L2PerfBufferState for specified core
+ * Get L2SwimlaneAicpuTaskPool for specified core
  *
  * @param base_ptr Shared memory base address
  * @param core_index Core index (0 ~ num_cores-1)
- * @return L2PerfBufferState pointer
+ * @return L2SwimlaneAicpuTaskPool pointer
  */
-inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index) {
+inline L2SwimlaneAicpuTaskPool *get_perf_buffer_state(void *base_ptr, int core_index) {
     return &get_perf_buffer_states(base_ptr)[core_index];
 }
 
@@ -433,42 +448,45 @@ inline L2PerfBufferState *get_perf_buffer_state(void *base_ptr, int core_index)
  * @return Total bytes needed for header + all buffer states
  */
 inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) {
-    return calc_perf_data_size(num_cores) + sizeof(AicpuPhaseHeader) + num_sched_threads * sizeof(PhaseBufferState);
+    return calc_perf_data_size(num_cores) + sizeof(L2SwimlaneAicpuPhaseHeader) +
+           num_sched_threads * sizeof(L2SwimlaneAicpuPhasePool);
 }
 
 /**
- * Get AicpuPhaseHeader pointer (located after L2PerfBufferState array)
+ * Get L2SwimlaneAicpuPhaseHeader pointer (located after L2SwimlaneAicpuTaskPool array)
  *
  * @param base_ptr Shared memory base address
  * @param num_cores Number of AICore instances
- * @return AicpuPhaseHeader pointer
+ * @return L2SwimlaneAicpuPhaseHeader pointer
  */
-inline AicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) {
-    return reinterpret_cast<AicpuPhaseHeader *>(reinterpret_cast<char *>(base_ptr) + calc_perf_data_size(num_cores));
+inline L2SwimlaneAicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) {
+    return reinterpret_cast<L2SwimlaneAicpuPhaseHeader *>(
+        reinterpret_cast<char *>(base_ptr) + calc_perf_data_size(num_cores)
+    );
 }
 
 /**
- * Get PhaseBufferState array start address (located after AicpuPhaseHeader)
+ * Get L2SwimlaneAicpuPhasePool array start address (located after L2SwimlaneAicpuPhaseHeader)
  *
  * @param base_ptr Shared memory base address
  * @param num_cores Number of AICore instances
- * @return PhaseBufferState array pointer
+ * @return L2SwimlaneAicpuPhasePool array pointer
  */
-inline PhaseBufferState *get_phase_buffer_states(void *base_ptr, int num_cores) {
-    return reinterpret_cast<PhaseBufferState *>(
-        reinterpret_cast<char *>(get_phase_header(base_ptr, num_cores)) + sizeof(AicpuPhaseHeader)
+inline L2SwimlaneAicpuPhasePool *get_phase_buffer_states(void *base_ptr, int num_cores) {
+    return reinterpret_cast<L2SwimlaneAicpuPhasePool *>(
+        reinterpret_cast<char *>(get_phase_header(base_ptr, num_cores)) + sizeof(L2SwimlaneAicpuPhaseHeader)
     );
 }
 
 /**
- * Get PhaseBufferState for specified thread
+ * Get L2SwimlaneAicpuPhasePool for specified thread
  *
  * @param base_ptr Shared memory base address
  * @param num_cores Number of AICore instances
  * @param thread_idx Thread index
- * @return PhaseBufferState pointer
+ * @return L2SwimlaneAicpuPhasePool pointer
  */
-inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) {
+inline L2SwimlaneAicpuPhasePool *get_phase_buffer_state(void *base_ptr, int num_cores, int thread_idx) {
     return &get_phase_buffer_states(base_ptr, num_cores)[thread_idx];
 }
 
@@ -476,4 +494,4 @@ inline PhaseBufferState *get_phase_buffer_state(void *base_ptr, int num_cores, i
 }
 #endif
 
-#endif  // SRC_A5_PLATFORM_INCLUDE_COMMON_L2_PERF_PROFILING_H_
+#endif  // SRC_A5_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h
index 9d1c0cb57..75cddbf18 100644
--- a/src/a5/platform/include/common/platform_config.h
+++ b/src/a5/platform/include/common/platform_config.h
@@ -104,7 +104,7 @@ constexpr int PLATFORM_MAX_CORES = PLATFORM_MAX_BLOCKDIM * PLATFORM_CORES_PER_BL
 
 /**
  * Performance buffer capacity per buffer
- * Number of L2PerfRecord entries per dynamically allocated L2PerfBuffer
+ * Number of L2SwimlaneAicpuTaskRecord entries per dynamically allocated L2SwimlaneAicpuTaskBuffer
  */
 constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000;
 
@@ -118,13 +118,13 @@ constexpr int PLATFORM_PROF_BUFFER_SIZE = 1000;
 constexpr int PLATFORM_PROF_SLOT_COUNT = 4;
 
 /**
- * L2PerfBuffer pre-allocation count per AICore.
+ * L2SwimlaneAicpuTaskBuffer pre-allocation count per AICore.
  * 1 goes into the free_queue at init, the rest into the recycled pool.
  */
 constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8;
 
 /**
- * PhaseBuffer pre-allocation count per AICPU thread.
+ * L2SwimlaneAicpuPhaseBuffer pre-allocation count per AICPU thread.
  * 1 goes into the free_queue at init, the rest into the recycled pool.
  */
 constexpr int PLATFORM_PROF_BUFFERS_PER_THREAD = 16;
@@ -139,7 +139,7 @@ constexpr int PLATFORM_PROF_READYQUEUE_SIZE =
 
 /**
  * Performance buffer capacity per AICPU thread
- * Maximum number of AicpuPhaseRecord entries per PhaseBuffer.
+ * Maximum number of L2SwimlaneAicpuPhaseRecord entries per L2SwimlaneAicpuPhaseBuffer.
  */
 constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 500000;
 
@@ -229,14 +229,14 @@ constexpr int PLATFORM_DUMP_TIMEOUT_SECONDS = 30;
 constexpr int PLATFORM_PMU_RECORDS_PER_BUFFER = 512;
 
 /**
- * Per-core L2Perf staging ring depth (AICore-side WIP slots).
+ * Per-core L2Swimlane staging ring depth (AICore-side WIP slots).
  *
  * Must be ≥ the maximum number of in-flight tasks per core (today's
  * dual-issue dispatch keeps this at 2). The ring lives outside the
- * rotating L2PerfBuffer so AICore's write address never changes mid-run.
+ * rotating L2SwimlaneAicpuTaskBuffer so AICore's write address never changes mid-run.
  *
  * Indexing uses `task_id % PLATFORM_L2_AICORE_RING_SIZE` (see
- * `l2_perf_aicore_record_task`), so non-power-of-two values are correct
+ * `l2_swimlane_aicore_record_task`), so non-power-of-two values are correct
  * but compile to an integer divide on the AICore hot path. Prefer a power
  * of two so the compiler reduces the modulo to a mask.
  */
diff --git a/src/a5/platform/include/common/pmu_profiling.h b/src/a5/platform/include/common/pmu_profiling.h
index ad2fb39d0..680c81c83 100644
--- a/src/a5/platform/include/common/pmu_profiling.h
+++ b/src/a5/platform/include/common/pmu_profiling.h
@@ -25,7 +25,7 @@
  *
  * a5 has no halHostRegister (DAV_3510), so host↔device SPSC fields are
  * read/written via rtMemcpy (onboard) or memcpy (sim), using host shadow
- * buffers — same pattern as a5 l2_perf_collector and tensor_dump_collector.
+ * buffers — same pattern as a5 l2_swimlane_collector and tensor_dump_collector.
  */
 
 #ifndef SRC_A5_PLATFORM_INCLUDE_COMMON_PMU_PROFILING_H_
diff --git a/src/a5/platform/include/common/scope_stats.h b/src/a5/platform/include/common/scope_stats.h
index 88efa72dd..844e34089 100644
--- a/src/a5/platform/include/common/scope_stats.h
+++ b/src/a5/platform/include/common/scope_stats.h
@@ -17,7 +17,7 @@
  * scope_end — each carrying the task/heap ring start/end and the tensormap
  * live-entry count sampled at that boundary, tagged with a phase flag. Records
  * stream off the device in
- * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_perf (the
+ * fixed-capacity buffers, mirroring PMU / dep_gen / tensor_dump / l2_swimlane (the
  * single source of mgmt-loop truth is
  * src/a2a3/platform/include/host/profiling_common/profiler_base.h):
  *
diff --git a/src/a5/platform/include/host/l2_perf_collector.h b/src/a5/platform/include/host/l2_swimlane_collector.h
similarity index 66%
rename from src/a5/platform/include/host/l2_perf_collector.h
rename to src/a5/platform/include/host/l2_swimlane_collector.h
index 2218f0952..dd5e33cc1 100644
--- a/src/a5/platform/include/host/l2_perf_collector.h
+++ b/src/a5/platform/include/host/l2_swimlane_collector.h
@@ -10,24 +10,24 @@
  */
 
 /**
- * @file l2_perf_collector.h
+ * @file l2_swimlane_collector.h
  * @brief Platform-agnostic performance data collector with dynamic memory management.
  *
  * Architecture:
- * - BufferPoolManager<L2PerfModule>: shared mgmt-thread infrastructure that
+ * - BufferPoolManager<L2SwimlaneModule>: shared mgmt-thread infrastructure that
  *   polls the AICPU ready queue, replenishes per-core / per-thread free
  *   queues, and hands full buffers off to the collector thread.
- * - L2PerfCollector: copies records from the manager's ready queue into
+ * - L2SwimlaneCollector: copies records from the manager's ready queue into
  *   host vectors and exports the swimlane visualization.
  *
  * a5 specifics: device↔host transfers go through profiling_copy.h. The
  * framework's mgmt loop mirrors the shm region per tick; per-buffer
- * payloads (L2PerfBuffer / PhaseBuffer) are pulled on demand inside
+ * payloads (L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer) are pulled on demand inside
  * ProfilerAlgorithms.
  */
 
-#ifndef SRC_A5_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_
-#define SRC_A5_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_
+#ifndef SRC_A5_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
+#define SRC_A5_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
 
 #include <atomic>
 #include <cstdint>
@@ -37,29 +37,29 @@
 #include <thread>
 #include <vector>
 
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "host/profiling_common/profiler_base.h"
 
 // ---------------------------------------------------------------------------
-// L2 Perf profiling Module (drives BufferPoolManager<L2PerfModule>)
+// L2 Swimlane profiling Module (drives BufferPoolManager<L2SwimlaneModule>)
 // ---------------------------------------------------------------------------
 
 /**
  * L2 Perf has two distinct buffer kinds going through one ready queue per
  * AICPU thread:
- *   - kind 0: per-core L2PerfBuffer (task records)
- *   - kind 1: per-thread PhaseBuffer (scheduler/orchestrator phase records)
- * The ReadyQueueEntry::is_phase flag picks between them.
+ *   - kind 0: per-core L2SwimlaneAicpuTaskBuffer (task records)
+ *   - kind 1: per-thread L2SwimlaneAicpuPhaseBuffer (scheduler/orchestrator phase records)
+ * The ReadyQueueEntry::kind field (an L2SwimlaneBufferKind) picks between them.
  */
 
 /**
  * Buffer kind discriminator carried in ReadyBufferInfo and used to index
  * the per-kind recycled pool inside BufferPoolManager.
  */
-enum class ProfBufferType { PERF_RECORD = 0, PHASE = 1 };
+enum class ProfBufferType { AICPU_TASK = 0, AICPU_PHASE = 1 };
 
 /**
  * Information about a ready (full) buffer, passed from mgmt thread to
@@ -74,16 +74,16 @@ struct ReadyBufferInfo {
     uint32_t buffer_seq;    // Sequence number for ordering
 };
 
-struct L2PerfModule {
-    using DataHeader = L2PerfDataHeader;
+struct L2SwimlaneModule {
+    using DataHeader = L2SwimlaneDataHeader;
     using ReadyEntry = ReadyQueueEntry;
     using ReadyBufferInfo = ::ReadyBufferInfo;
-    using FreeQueue = L2PerfFreeQueue;  // PhaseBufferState aliases L2PerfBufferState
+    using FreeQueue = L2SwimlaneFreeQueue;  // L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool
 
     static constexpr int kBufferKinds = 2;  // 0=PERF_RECORD, 1=PHASE
     static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT;
-    static constexpr const char *kSubsystemName = "L2PerfModule";
+    static constexpr const char *kSubsystemName = "L2SwimlaneModule";
 
     /**
      * batch_size for proactive_replenish's alloc fallback. Sized so that a
@@ -99,39 +99,39 @@ struct L2PerfModule {
 
     static int kind_of(const ReadyBufferInfo &info) { return static_cast<int>(info.type); }
 
-    static DataHeader *header_from_shm(void *shm) { return get_l2_perf_header(shm); }
+    static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); }
 
     /**
-     * Branch on `is_phase` to pick the per-core perf state vs. the
+     * Branch on `entry.kind` to pick the per-core perf state vs. the
      * per-thread phase state. Returns nullopt for out-of-range indices
      * (which would otherwise corrupt unrelated BufferStates downstream).
      */
-    static std::optional<profiling_common::EntrySite<L2PerfModule>>
+    static std::optional<profiling_common::EntrySite<L2SwimlaneModule>>
     resolve_entry(void *shm, DataHeader *header, int /*q*/, const ReadyEntry &entry) {
-        const bool is_phase = (entry.is_phase != 0);
+        const bool is_phase = (entry.kind == L2SwimlaneBufferKind::AicpuPhase);
         const int num_cores = static_cast<int>(header->num_cores);
 
         if (is_phase) {
             if (entry.core_index >= static_cast<uint32_t>(PLATFORM_MAX_AICPU_THREADS)) {
-                LOG_ERROR("L2PerfModule: invalid phase entry: thread=%u", entry.core_index);
+                LOG_ERROR("L2SwimlaneModule: invalid phase entry: thread=%u", entry.core_index);
                 return std::nullopt;
             }
         } else {
             if (entry.core_index >= static_cast<uint32_t>(num_cores)) {
-                LOG_ERROR("L2PerfModule: invalid perf entry: core=%u", entry.core_index);
+                LOG_ERROR("L2SwimlaneModule: invalid perf entry: core=%u", entry.core_index);
                 return std::nullopt;
             }
         }
 
-        L2PerfBufferState *state = is_phase ?
-                                       get_phase_buffer_state(shm, num_cores, static_cast<int>(entry.core_index)) :
-                                       get_perf_buffer_state(shm, static_cast<int>(entry.core_index));
+        L2SwimlaneAicpuTaskPool *state =
+            is_phase ? get_phase_buffer_state(shm, num_cores, static_cast<int>(entry.core_index)) :
+                       get_perf_buffer_state(shm, static_cast<int>(entry.core_index));
 
-        profiling_common::EntrySite<L2PerfModule> site;
-        site.kind = is_phase ? 1 : 0;
+        profiling_common::EntrySite<L2SwimlaneModule> site;
+        site.kind = static_cast<int>(entry.kind);
         site.free_queue = &state->free_queue;
-        site.buffer_size = is_phase ? sizeof(PhaseBuffer) : sizeof(L2PerfBuffer);
-        site.info.type = is_phase ? ProfBufferType::PHASE : ProfBufferType::PERF_RECORD;
+        site.buffer_size = is_phase ? sizeof(L2SwimlaneAicpuPhaseBuffer) : sizeof(L2SwimlaneAicpuTaskBuffer);
+        site.info.type = is_phase ? ProfBufferType::AICPU_PHASE : ProfBufferType::AICPU_TASK;
         site.info.index = entry.core_index;
         site.info.slot_idx = 0;
         site.info.dev_buffer_ptr = reinterpret_cast<void *>(entry.buffer_ptr);
@@ -146,17 +146,18 @@ struct L2PerfModule {
 
         // Per-core perf states (kind 0)
         for (int i = 0; i < num_cores; i++) {
-            L2PerfBufferState *state = get_perf_buffer_state(shm, i);
-            cb(/*kind=*/0, &state->free_queue, sizeof(L2PerfBuffer));
+            L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm, i);
+            cb(/*kind=*/0, &state->free_queue, sizeof(L2SwimlaneAicpuTaskBuffer));
         }
 
-        // Per-thread phase states (kind 1) — gated on AicpuPhaseHeader being
+        // Per-thread phase states (kind 1) — gated on L2SwimlaneAicpuPhaseHeader being
         // initialized (runtimes that don't emit phase records leave it zero).
-        AicpuPhaseHeader *ph = get_phase_header(shm, num_cores);
-        const int num_phase_threads = (ph->magic == AICPU_PHASE_MAGIC) ? static_cast<int>(ph->num_sched_threads) : 0;
+        L2SwimlaneAicpuPhaseHeader *ph = get_phase_header(shm, num_cores);
+        const int num_phase_threads =
+            (ph->magic == L2_SWIMLANE_AICPU_PHASE_MAGIC) ? static_cast<int>(ph->num_sched_threads) : 0;
         for (int t = 0; t < num_phase_threads; t++) {
-            PhaseBufferState *state = get_phase_buffer_state(shm, num_cores, t);
-            cb(/*kind=*/1, &state->free_queue, sizeof(PhaseBuffer));
+            L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm, num_cores, t);
+            cb(/*kind=*/1, &state->free_queue, sizeof(L2SwimlaneAicpuPhaseBuffer));
         }
     }
 };
@@ -167,13 +168,13 @@ struct L2PerfModule {
 // because they wrap stateless HAL globals. On a5 onboard the runner passes
 // register_cb=nullptr and the framework installs a malloc-shadow + DMA
 // fallback (default_host_shadow_register).
-using L2PerfAllocCallback = profiling_common::ProfAllocCallback;
-using L2PerfRegisterCallback = profiling_common::ProfRegisterCallback;
-using L2PerfUnregisterCallback = profiling_common::ProfUnregisterCallback;
-using L2PerfFreeCallback = profiling_common::ProfFreeCallback;
+using L2SwimlaneAllocCallback = profiling_common::ProfAllocCallback;
+using L2SwimlaneRegisterCallback = profiling_common::ProfRegisterCallback;
+using L2SwimlaneUnregisterCallback = profiling_common::ProfUnregisterCallback;
+using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback;
 
 // =============================================================================
-// L2PerfCollector
+// L2SwimlaneCollector
 // =============================================================================
 
 /**
@@ -192,7 +193,7 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback;
  *                                    entries have a consumer).
  *   5. read_phase_header_metadata() — single-shot read of the
  *                                    core→thread mapping from the
- *                                    AicpuPhaseHeader.
+ *                                    L2SwimlaneAicpuPhaseHeader.
  *   6. reconcile_counters()        — leftover-active sanity check (a5 lacks
  *                                    total/dropped/mismatch counters until
  *                                    the staging-ring redesign lands).
@@ -202,31 +203,31 @@ using L2PerfFreeCallback = profiling_common::ProfFreeCallback;
  * device flush is the only data path. Any non-zero `current_buf_ptr` after
  * stop() with non-empty count is logged as a bug.
  */
-class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L2PerfModule> {
+class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule> {
 public:
-    L2PerfCollector() = default;
-    ~L2PerfCollector();
+    L2SwimlaneCollector() = default;
+    ~L2SwimlaneCollector();
 
-    L2PerfCollector(const L2PerfCollector &) = delete;
-    L2PerfCollector &operator=(const L2PerfCollector &) = delete;
+    L2SwimlaneCollector(const L2SwimlaneCollector &) = delete;
+    L2SwimlaneCollector &operator=(const L2SwimlaneCollector &) = delete;
 
     // ProfilerBase contract
     static constexpr int kIdleTimeoutSec = PLATFORM_PROF_TIMEOUT_SECONDS;
-    static constexpr const char *kSubsystemName = "L2Perf";
+    static constexpr const char *kSubsystemName = "L2Swimlane";
 
     /**
      * Initialize performance profiling.
      *
      * Allocates the shared-memory region (header + per-core / per-thread
-     * BufferStates), pre-allocates initial L2PerfBuffers and PhaseBuffers,
+     * BufferStates), pre-allocates initial L2SwimlaneAicpuTaskBuffers and L2SwimlaneAicpuPhaseBuffers,
      * and seeds the per-pool free_queues + the framework's recycled pools.
      *
      * @param num_aicore     Number of AICore instances
      * @param device_id      Device ID (forwarded to register_cb)
-     * @param l2_perf_level  Collection granularity (DISABLED / AICORE_TIMING
+     * @param l2_swimlane_level  Collection granularity (DISABLED / AICORE_TIMING
      *                       / AICPU_TIMING / SCHED_PHASES / ORCH_PHASES).
-     *                       Written into `L2PerfDataHeader::l2_perf_level`
-     *                       so AICPU can promote it in `l2_perf_aicpu_init`,
+     *                       Written into `L2SwimlaneDataHeader::l2_swimlane_level`
+     *                       so AICPU can promote it in `l2_swimlane_aicpu_init`,
      *                       AND cached on the collector so
      *                       `export_swimlane_json()` can gate phase sections
      *                       and stamp the JSON `version`.
@@ -235,27 +236,27 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
      *                       host-shadow allocation via malloc)
      * @param free_cb        Device memory free callback
      * @param user_data      Opaque pointer forwarded to callbacks
-     * @param output_prefix  Per-task directory; l2_perf_records.json lands
+     * @param output_prefix  Per-task directory; l2_swimlane_records.json lands
      *                       here. Required (non-empty); CallConfig::validate()
      *                       enforces this upstream.
      * @return 0 on success, error code on failure
      */
     int initialize(
-        int num_aicore, int device_id, L2PerfLevel l2_perf_level, const L2PerfAllocCallback &alloc_cb,
-        L2PerfRegisterCallback register_cb, const L2PerfFreeCallback &free_cb, const std::string &output_prefix
+        int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb,
+        L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
     );
 
     /**
      * Per-buffer callback invoked by ProfilerBase's poll loop. Dispatches
-     * on info.type to copy either an L2PerfBuffer (PERF_RECORD) into the
-     * per-core record vector or a PhaseBuffer (PHASE) into the per-thread
+     * on info.type to copy either an L2SwimlaneAicpuTaskBuffer (AICPU_TASK) into the
+     * per-core record vector or an L2SwimlaneAicpuPhaseBuffer (AICPU_PHASE) into the per-thread
      * phase-record vector.
      */
     void on_buffer_collected(const ReadyBufferInfo &info);
 
     /**
      * Export collected records as a Chrome Trace Event JSON (swimlane view).
-     * Writes <output_prefix>/l2_perf_records.json — directory captured at
+     * Writes <output_prefix>/l2_swimlane_records.json — directory captured at
      * initialize() time.
      *
      * @return 0 on success, error code on failure
@@ -271,7 +272,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
      * @param user_data      Opaque pointer forwarded to callbacks
      * @return 0 on success, error code on failure
      */
-    int finalize(L2PerfUnregisterCallback unregister_cb, const L2PerfFreeCallback &free_cb);
+    int finalize(L2SwimlaneUnregisterCallback unregister_cb, const L2SwimlaneFreeCallback &free_cb);
 
     /**
      * @return true if initialize() succeeded and finalize() has not run.
@@ -279,22 +280,22 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     bool is_initialized() const { return shm_host_ != nullptr; }
 
     /**
-     * Device pointer to the L2PerfDataHeader. Set kernel_args.l2_perf_data_base
+     * Device pointer to the L2SwimlaneDataHeader. Set kernel_args.l2_swimlane_data_base
      * to this after initialize() succeeds so the AICPU side can find the
      * shared memory.
      */
-    void *get_l2_perf_setup_device_ptr() const { return perf_shared_mem_dev_; }
+    void *get_l2_swimlane_setup_device_ptr() const { return perf_shared_mem_dev_; }
 
     /**
-     * Device pointer to the per-core L2PerfAicoreRing-address table
+     * Device pointer to the per-core L2SwimlaneAicoreRing-address table
      * (uint64_t[num_aicore]). Wire this into
-     * `KernelArgs::aicore_l2_perf_ring_addrs` so the AICore kernel
+     * `KernelArgs::aicore_l2_swimlane_ring_addrs` so the AICore kernel
      * entry forwards each core's ring pointer into platform state.
      */
     void *get_aicore_ring_addrs_device_ptr() const { return aicore_ring_addrs_dev_; }
 
     /**
-     * Read AICPU phase metadata that lives in AicpuPhaseHeader (not on the
+     * Read AICPU phase metadata that lives in L2SwimlaneAicpuPhaseHeader (not on the
      * buffer pipeline): the core→thread mapping plus a has-data signal
      * derived from accumulated per-event records. Single-shot — must be
      * called after stop() so the shm region has settled.
@@ -308,7 +309,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
      * un-flushed leftovers (device flush should always succeed-or-bump-
      * dropped, so a non-empty leftover indicates an AICPU flush bug).
      *
-     * NOTE: a5's L2PerfBufferState does not yet carry total/dropped/mismatch
+     * NOTE: a5's L2SwimlaneAicpuTaskPool does not yet carry total/dropped/mismatch
      * counters (they land with the AICore staging-ring redesign in a later
      * task). The full `collected + dropped + mismatch == device_total`
      * cross-check is therefore deferred. Must be called after stop().
@@ -316,9 +317,9 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     void reconcile_counters();
 
     /**
-     * @return Per-core L2PerfRecord vectors (indexed by core_index). For tests.
+     * @return Per-core L2SwimlaneAicpuTaskRecord vectors (indexed by core_index). For tests.
      */
-    const std::vector<std::vector<L2PerfRecord>> &get_records() const { return collected_perf_records_; }
+    const std::vector<std::vector<L2SwimlaneAicpuTaskRecord>> &get_records() const { return collected_perf_records_; }
 
 private:
     // Shared memory pointers. shm_host_ / device_id_ live on ProfilerBase
@@ -326,25 +327,25 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     void *perf_shared_mem_dev_{nullptr};
 
     // Per-core stable AICore staging rings — allocated once, never rotated.
-    // The host owns the device-side L2PerfAicoreRing buffers and the address
+    // The host owns the device-side L2SwimlaneAicoreRing buffers and the address
     // table; AICPU reads `state.aicore_ring_ptr` (set at init), and AICore
-    // reads from `KernelArgs::aicore_l2_perf_ring_addrs[block_idx]`.
+    // reads from `KernelArgs::aicore_l2_swimlane_ring_addrs[block_idx]`.
     std::vector<void *> aicore_rings_dev_;
     void *aicore_ring_addrs_dev_{nullptr};
     void *aicore_ring_addrs_host_{nullptr};
 
     int num_aicore_{0};
-    L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
 
     // Per-task output directory captured at initialize() time. Consumed by
-    // export_swimlane_json() to build <prefix>/l2_perf_records.json.
+    // export_swimlane_json() to build <prefix>/l2_swimlane_records.json.
     std::string output_prefix_;
 
     // Collected data (per-core vectors, indexed by core_index)
-    std::vector<std::vector<L2PerfRecord>> collected_perf_records_;
+    std::vector<std::vector<L2SwimlaneAicpuTaskRecord>> collected_perf_records_;
 
     // AICPU phase profiling data (per-thread, mixed sched + orch records)
-    std::vector<std::vector<AicpuPhaseRecord>> collected_phase_records_;
+    std::vector<std::vector<L2SwimlaneAicpuPhaseRecord>> collected_phase_records_;
     bool has_phase_data_{false};
 
     // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned)
@@ -356,7 +357,7 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     uint64_t total_perf_collected_{0};
     uint64_t total_phase_collected_{0};
 
-    // Allocate a single buffer (shm region / L2PerfBuffer / PhaseBuffer) and
+    // Allocate a single buffer (shm region / L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer) and
     // its paired host shadow.
     void *alloc_single_buffer(size_t size, void **host_ptr_out);
 
@@ -365,4 +366,4 @@ class L2PerfCollector : public profiling_common::ProfilerBase<L2PerfCollector, L
     void copy_phase_buffer(const ReadyBufferInfo &info);
 };
 
-#endif  // SRC_A5_PLATFORM_INCLUDE_HOST_L2_PERF_COLLECTOR_H_
+#endif  // SRC_A5_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
diff --git a/src/a5/platform/include/host/profiling_common/buffer_pool_manager.h b/src/a5/platform/include/host/profiling_common/buffer_pool_manager.h
index ffa50ebc9..a033c2ebf 100644
--- a/src/a5/platform/include/host/profiling_common/buffer_pool_manager.h
+++ b/src/a5/platform/include/host/profiling_common/buffer_pool_manager.h
@@ -11,7 +11,7 @@
 
 /**
  * @file buffer_pool_manager.h
- * @brief Generic buffer-pool data structure shared by L2Perf, TensorDump,
+ * @brief Generic buffer-pool data structure shared by L2Swimlane, TensorDump,
  *        and PMU collectors. Owns:
  *
  *   - ready_queue (mgmt → collector) with mutex/cv,
@@ -44,7 +44,7 @@
  *      `mirror_shm_to_device` is kept for init/teardown but is NOT used by
  *      the mgmt loop — bulk write-back races with AICPU writes to
  *      device-only fields (current_buf_ptr, total/dropped/mismatch
- *      counters, queue_tails, free_queue.head, AicpuPhaseHeader::magic).
+ *      counters, queue_tails, free_queue.head, L2SwimlaneAicpuPhaseHeader::magic).
  *   2. `reg` allocates a paired host shadow (instead of mapping a HAL view
  *      onto the device pointer); `release_owned_buffers` therefore frees
  *      both the device pointer (via `release_fn`) and the host shadow
@@ -267,7 +267,7 @@ class BufferPoolManager {
      *
      * NOTE: deprecated for a5 — bulk write_back races with AICPU writes to
      * device-owned fields (BufferState::current_buf_ptr, total/dropped/mismatch
-     * counters, queue_tails, free_queue.head, AicpuPhaseHeader::magic, ...).
+     * counters, queue_tails, free_queue.head, L2SwimlaneAicpuPhaseHeader::magic, ...).
      * The bulk write rolls those updates back to whatever was in the host
      * shadow at mirror_from_device time. Keep the method around so callers
      * outside the mgmt loop (init/teardown) still have a way to push the
@@ -352,7 +352,7 @@ class BufferPoolManager {
     }
 
     /**
-     * Pull a single buffer's contents (e.g. an L2PerfBuffer / PmuBuffer /
+     * Pull a single buffer's contents (e.g. an L2SwimlaneAicpuTaskBuffer / PmuBuffer /
      * DumpMetaBuffer) from device to its host shadow. Called by
      * ProfilerAlgorithms::process_entry after resolving the host pointer
      * for a popped ready entry, before delivering it to the collector.
@@ -548,7 +548,7 @@ class BufferPoolManager {
     // dev → host mapping (single source of truth for resolve_host_ptr)
     std::unordered_map<void *, void *> dev_to_host_;
 
-    // Per-kind recycled buffer pools (vector indexed by Module's BufferKind id)
+    // Per-kind recycled buffer pools (vector indexed by Module-defined kind id)
     std::vector<std::vector<void *>> recycled_;
 };
 
diff --git a/src/a5/platform/include/host/profiling_common/profiler_base.h b/src/a5/platform/include/host/profiling_common/profiler_base.h
index 94ebcad87..496fa9ebb 100644
--- a/src/a5/platform/include/host/profiling_common/profiler_base.h
+++ b/src/a5/platform/include/host/profiling_common/profiler_base.h
@@ -11,7 +11,7 @@
 
 /**
  * @file profiler_base.h
- * @brief CRTP scaffolding shared by L2Perf / Dump / PMU collectors.
+ * @brief CRTP scaffolding shared by L2Swimlane / Dump / PMU collectors.
  *
  * Owns the BufferPoolManager<Module>, the mgmt thread (which polls AICPU
  * ready queues and recycles buffers), and the collector poll thread.
@@ -19,12 +19,12 @@
  * Module concept contract
  * -----------------------
  *
- * Each profiling subsystem provides a `Module` struct (e.g., L2PerfModule,
+ * Each profiling subsystem provides a `Module` struct (e.g., L2SwimlaneModule,
  * DumpModule, PmuModule) that supplies the data-layout traits the unified
  * mgmt-loop algorithms (ProfilerAlgorithms<Module>) need. Required members:
  *
  *   // Types
- *   using DataHeader      = ...;   // Shared-memory header (e.g. L2PerfDataHeader).
+ *   using DataHeader      = ...;   // Shared-memory header (e.g. L2SwimlaneDataHeader).
  *   using ReadyEntry      = ...;   // Per-AICPU-thread ready-queue entry.
  *   using ReadyBufferInfo = ...;   // Hand-off struct to the collector thread
  *                                  // (carries dev/host ptrs, optional kind
@@ -34,10 +34,10 @@
  *                                  // `buffer_ptrs[kSlotCount]`.
  *
  *   // Constants
- *   static constexpr int      kBufferKinds;    // L2Perf=2 (perf+phase), Dump=1, PMU=1.
+ *   static constexpr int      kBufferKinds;    // L2Swimlane=2 (perf+phase), Dump=1, PMU=1.
  *   static constexpr uint32_t kReadyQueueSize; // Per-thread ready-queue depth.
  *   static constexpr uint32_t kSlotCount;      // FreeQueue::buffer_ptrs[] length.
- *   static constexpr const char* kSubsystemName; // "PMU" / "L2Perf" / "Dump".
+ *   static constexpr const char* kSubsystemName; // "PMU" / "L2Swimlane" / "Dump".
  *
  *   // Header pointer cast (host_ptr → DataHeader*)
  *   static DataHeader* header_from_shm(void* shared_mem_host);
@@ -115,7 +115,7 @@
  *     `write_range_to_device` writes. The bulk `mirror_shm_to_device` is
  *     intentionally NOT called from mgmt_loop: it raced with AICPU writes
  *     to device-only fields (current_buf_ptr, total/dropped/mismatch
- *     counters, queue_tails, free_queue.head, AicpuPhaseHeader::magic) and
+ *     counters, queue_tails, free_queue.head, L2SwimlaneAicpuPhaseHeader::magic) and
  *     rolled them back to the host-shadow values mirrored in at the top of
  *     the tick. Buffer contents are mirrored on demand inside
  *     ProfilerAlgorithms.
@@ -138,7 +138,7 @@
  *       (use the subsystem's PLATFORM_*_TIMEOUT_SECONDS).
  *
  *   static constexpr const char*  kSubsystemName;
- *       Used in the idle-timeout log line (e.g. "L2Perf", "PMU", "TensorDump").
+ *       Used in the idle-timeout log line (e.g. "L2Swimlane", "PMU", "TensorDump").
  */
 
 #ifndef SRC_A5_PLATFORM_INCLUDE_HOST_PROFILING_COMMON_PROFILER_BASE_H_
@@ -162,7 +162,7 @@
 namespace profiling_common {
 
 // Common subsystem callback signatures. All four collectors (PMU / TensorDump
-// / L2Perf / DepGen) used to declare their own typedefs with identical
+// / L2Swimlane / DepGen) used to declare their own typedefs with identical
 // shapes; these are the canonical types stashed in ProfilerBase via
 // set_memory_context().
 //
@@ -590,7 +590,7 @@ class ProfilerBase {
      *
      * The bulk `mirror_shm_to_device` deliberately is NOT called: it races
      * with AICPU writes to device-only fields (current_buf_ptr, total/dropped/
-     * mismatch counters, queue_tails, free_queue.head, AicpuPhaseHeader::magic,
+     * mismatch counters, queue_tails, free_queue.head, L2SwimlaneAicpuPhaseHeader::magic,
      * core_to_thread[]) and rolls them back to whatever was mirrored in at
      * the start of the tick. Each host-side modification is written back as
      * a narrow field write inside Alg.
diff --git a/src/a5/platform/onboard/aicore/kernel.cpp b/src/a5/platform/onboard/aicore/kernel.cpp
index 6789b66b0..d53305f4e 100644
--- a/src/a5/platform/onboard/aicore/kernel.cpp
+++ b/src/a5/platform/onboard/aicore/kernel.cpp
@@ -15,7 +15,7 @@
 #include "aicore/aicore_profiling_state.h"
 #include "common/core_type.h"
 #include "common/kernel_args.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/pmu_profiling.h"
 
@@ -47,17 +47,19 @@ class Runtime;
 // linker dedup the otherwise-duplicate symbol definitions across the two
 // compilation units.
 [[block_local]] static uint32_t s_aicore_profiling_flag;
-[[block_local]] static __gm__ L2PerfAicoreRing *s_aicore_l2_perf_ring;
+[[block_local]] static __gm__ L2SwimlaneAicoreRing *s_aicore_l2_swimlane_ring;
 [[block_local]] static __gm__ PmuAicoreRing *s_aicore_pmu_ring;
 [[block_local]] static uint64_t s_aicore_pmu_reg_base;
 
 __attribute__((weak)) __aicore__ void set_aicore_profiling_flag(uint32_t flag) { s_aicore_profiling_flag = flag; }
 __attribute__((weak)) __aicore__ uint32_t get_aicore_profiling_flag() { return s_aicore_profiling_flag; }
 
-__attribute__((weak)) __aicore__ void set_aicore_l2_perf_ring(__gm__ L2PerfAicoreRing *ring) {
-    s_aicore_l2_perf_ring = ring;
+__attribute__((weak)) __aicore__ void set_aicore_l2_swimlane_ring(__gm__ L2SwimlaneAicoreRing *ring) {
+    s_aicore_l2_swimlane_ring = ring;
+}
+__attribute__((weak)) __aicore__ __gm__ L2SwimlaneAicoreRing *get_aicore_l2_swimlane_ring() {
+    return s_aicore_l2_swimlane_ring;
 }
-__attribute__((weak)) __aicore__ __gm__ L2PerfAicoreRing *get_aicore_l2_perf_ring() { return s_aicore_l2_perf_ring; }
 
 __attribute__((weak)) __aicore__ void set_aicore_pmu_ring(__gm__ PmuAicoreRing *ring) { s_aicore_pmu_ring = ring; }
 __attribute__((weak)) __aicore__ __gm__ PmuAicoreRing *get_aicore_pmu_ring() { return s_aicore_pmu_ring; }
@@ -80,7 +82,7 @@ extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, Co
  *
  * Each core (AIC or AIV) gets its own handshake buffer indexed by block_idx.
  * Profiling state flows from KernelArgs into platform-owned per-core slots
- * via set_aicore_profiling_flag() / set_aicore_l2_perf_ring() /
+ * via set_aicore_profiling_flag() / set_aicore_l2_swimlane_ring() /
  * set_aicore_pmu_ring() / set_aicore_pmu_reg_base(); the runtime's
  * Handshake stays profiling-free and aicore_execute keeps its original
  * signature.
@@ -105,14 +107,14 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA
     // does not depend on any AICPU init ordering.
     set_aicore_profiling_flag(k_args->enable_profiling_flag);
     if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) {
-        __gm__ uint64_t *ring_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_l2_perf_ring_addrs);
+        __gm__ uint64_t *ring_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_l2_swimlane_ring_addrs);
         if (ring_table != nullptr) {
-            set_aicore_l2_perf_ring(reinterpret_cast<__gm__ L2PerfAicoreRing *>(ring_table[block_idx]));
+            set_aicore_l2_swimlane_ring(reinterpret_cast<__gm__ L2SwimlaneAicoreRing *>(ring_table[block_idx]));
         } else {
-            set_aicore_l2_perf_ring(nullptr);
+            set_aicore_l2_swimlane_ring(nullptr);
         }
     } else {
-        set_aicore_l2_perf_ring(nullptr);
+        set_aicore_l2_swimlane_ring(nullptr);
     }
     if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU)) {
         __gm__ uint64_t *pmu_ring_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_pmu_ring_addrs);
diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp
index 0f18c9909..1761b8a64 100644
--- a/src/a5/platform/onboard/aicpu/kernel.cpp
+++ b/src/a5/platform/onboard/aicpu/kernel.cpp
@@ -15,7 +15,7 @@
 #include "common/platform_config.h"
 #include "aicpu/device_log.h"
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/platform_aicpu_affinity.h"
 #include "aicpu/pmu_collector_aicpu.h"
@@ -105,7 +105,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a
     set_orch_device_id(static_cast<int>(k_args->device_id));
     set_platform_dump_base(k_args->dump_data_base);
     set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR));
-    set_platform_l2_perf_base(k_args->l2_perf_data_base);
+    set_platform_l2_swimlane_base(k_args->l2_swimlane_data_base);
     set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE));
     set_platform_pmu_base(k_args->pmu_data_base);
     set_pmu_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU));
diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt
index d535ced0a..d613f0b46 100644
--- a/src/a5/platform/onboard/host/CMakeLists.txt
+++ b/src/a5/platform/onboard/host/CMakeLists.txt
@@ -43,7 +43,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp"
-    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/scope_stats_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 122d0181a..80199a3f9 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -108,9 +108,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     // Initialize per-subsystem shared memory.
     if (enable_l2_swimlane_) {
-        rc = init_l2_perf(num_aicore, device_id_);
+        rc = init_l2_swimlane(num_aicore, device_id_);
         if (rc != 0) {
-            LOG_ERROR("init_l2_perf failed: %d", rc);
+            LOG_ERROR("init_l2_swimlane failed: %d", rc);
             return rc;
         }
     }
@@ -219,8 +219,8 @@ int DeviceRunner::finalize() {
     // shadows). All four shared collectors use the same alloc/free shape
     // on a5: no unregister callback (a5 doesn't use halHostRegister) +
     // prof_free_cb (rtFree directly).
-    if (l2_perf_collector_.is_initialized()) {
-        l2_perf_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
+    if (l2_swimlane_collector_.is_initialized()) {
+        l2_swimlane_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
     }
     if (dump_collector_.is_initialized()) {
         dump_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
@@ -252,8 +252,8 @@ int DeviceRunner::finalize() {
 // `launch_aicpu_kernel` and `launch_aicore_kernel` live on `DeviceRunnerBase`.
 
 void DeviceRunner::finalize_collectors() {
-    if (l2_perf_collector_.is_initialized()) {
-        l2_perf_collector_.stop();
+    if (l2_swimlane_collector_.is_initialized()) {
+        l2_swimlane_collector_.stop();
     }
     if (dump_collector_.is_initialized()) {
         dump_collector_.stop();
@@ -263,15 +263,15 @@ void DeviceRunner::finalize_collectors() {
     }
 }
 
-int DeviceRunner::init_l2_perf(int num_aicore, int device_id) {
-    int rc = l2_perf_collector_.initialize(
-        num_aicore, device_id, l2_perf_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_
+int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
+    int rc = l2_swimlane_collector_.initialize(
+        num_aicore, device_id, l2_swimlane_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_
     );
     if (rc == 0) {
-        kernel_args_.args.l2_perf_data_base =
-            reinterpret_cast<uint64_t>(l2_perf_collector_.get_l2_perf_setup_device_ptr());
-        kernel_args_.args.aicore_l2_perf_ring_addrs =
-            reinterpret_cast<uint64_t>(l2_perf_collector_.get_aicore_ring_addrs_device_ptr());
+        kernel_args_.args.l2_swimlane_data_base =
+            reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr());
+        kernel_args_.args.aicore_l2_swimlane_ring_addrs =
+            reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_aicore_ring_addrs_device_ptr());
     }
     return rc;
 }
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index aaf7a6b30..332169b00 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -43,12 +43,12 @@
 #include "device_runner_helpers.h"  // common DeviceArgs + KernelArgsHelper
 #include "common/kernel_args.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "host/function_cache.h"
 #include "host/memory_allocator.h"
-#include "host/l2_perf_collector.h"
+#include "host/l2_swimlane_collector.h"
 #include "host/pmu_collector.h"
 #include "host/scope_stats_collector.h"
 #include "host/tensor_dump_collector.h"
@@ -141,7 +141,7 @@ class DeviceRunner : public DeviceRunnerBase {
     // (`ChipCallableBuffer`, `CallableState`, `OrchSoBuffer`) are
     // inherited from `DeviceRunnerBase`.
 
-    // Shared collectors (`l2_perf_collector_`, `dump_collector_`,
+    // Shared collectors (`l2_swimlane_collector_`, `dump_collector_`,
     // `pmu_collector_`, `scope_stats_collector_`) live on `DeviceRunnerBase`.
 
     // `query_max_block_dim`, `validate_block_dim`, `ensure_binaries_loaded`,
@@ -151,16 +151,16 @@ class DeviceRunner : public DeviceRunnerBase {
     /**
      * Initialize performance profiling device buffers
      *
-     * Allocates L2PerfSetupHeader and per-core/per-thread buffers on device;
-     * caller publishes the device pointer via kernel_args.l2_perf_data_base
-     * (AICPU reads it through get_platform_l2_perf_base()).
+     * Allocates L2SwimlaneSetupHeader and per-core/per-thread buffers on device;
+     * caller publishes the device pointer via kernel_args.l2_swimlane_data_base
+     * (AICPU reads it through get_platform_l2_swimlane_base()).
      *
      * @param runtime Runtime instance to configure
      * @param num_aicore Number of AICore instances
      * @param device_id Device ID
      * @return 0 on success, error code on failure
      */
-    int init_l2_perf(int num_aicore, int device_id);
+    int init_l2_swimlane(int num_aicore, int device_id);
 
     /**
      * Initialize tensor dump device buffers.
@@ -180,7 +180,7 @@ class DeviceRunner : public DeviceRunnerBase {
      * Signature matches a2a3 for cross-platform consistency.
      */
     // Shared enable flags (`enable_l2_swimlane_`, `enable_dump_tensor_`,
-    // `enable_pmu_`, `enable_scope_stats_`, `l2_perf_level_`,
+    // `enable_pmu_`, `enable_scope_stats_`, `l2_swimlane_level_`,
     // `pmu_event_type_`, `output_prefix_`) live on `DeviceRunnerBase`.
 
     int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id);
diff --git a/src/a5/platform/sim/aicore/inner_kernel.h b/src/a5/platform/sim/aicore/inner_kernel.h
index 42151f020..46c05f18c 100644
--- a/src/a5/platform/sim/aicore/inner_kernel.h
+++ b/src/a5/platform/sim/aicore/inner_kernel.h
@@ -38,12 +38,12 @@
 //   - with CACHELINE_OUT: write-back/flush (write to memory) -> release semantics
 // On aarch64, acquire-only fences do NOT prevent store-store reordering across the
 // barrier, so using acquire for the flush direction causes a race: the AICPU can
-// observe the COND register FIN signal before l2_perf_buf->count is visible.
+// observe the COND register FIN signal before l2_swimlane_buf->count is visible.
 // Using seq_cst (dmb ish / full barrier) covers both directions safely.
 // Use variadic macro to support both 2-arg and 3-arg calls.
 #define dcci(...) std::atomic_thread_fence(std::memory_order_seq_cst)
 
-// dsb / mem_dsb_t — CANN provides these on real AICore; l2_perf_collector uses them after dcci flush.
+// dsb / mem_dsb_t — CANN provides these on real AICore; l2_swimlane_collector uses them after dcci flush.
 // Simulation: full fence (same strength as dcci above) so AICPU ordering matches hardware intent.
 typedef int mem_dsb_t;
 #define dsb(_kind)                                           \
diff --git a/src/a5/platform/sim/aicore/kernel.cpp b/src/a5/platform/sim/aicore/kernel.cpp
index a81ff2d9b..414a2e125 100644
--- a/src/a5/platform/sim/aicore/kernel.cpp
+++ b/src/a5/platform/sim/aicore/kernel.cpp
@@ -23,7 +23,7 @@
 #include "aicore/aicore.h"
 #include "aicore/aicore_profiling_state.h"
 #include "common/core_type.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/pmu_profiling.h"
 #include "runtime.h"
@@ -35,7 +35,7 @@ static pthread_key_t g_reg_base_key;
 static pthread_key_t g_core_id_key;
 static pthread_key_t g_block_idx_key;
 static pthread_key_t g_aicore_profiling_flag_key;
-static pthread_key_t g_aicore_l2_perf_ring_key;
+static pthread_key_t g_aicore_l2_swimlane_ring_key;
 static pthread_key_t g_aicore_pmu_ring_key;
 static pthread_key_t g_pmu_reg_base_key;
 static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT;
@@ -45,7 +45,7 @@ static void create_tls_keys() {
     pthread_key_create(&g_core_id_key, nullptr);
     pthread_key_create(&g_block_idx_key, nullptr);
     pthread_key_create(&g_aicore_profiling_flag_key, nullptr);
-    pthread_key_create(&g_aicore_l2_perf_ring_key, nullptr);
+    pthread_key_create(&g_aicore_l2_swimlane_ring_key, nullptr);
     pthread_key_create(&g_aicore_pmu_ring_key, nullptr);
     pthread_key_create(&g_pmu_reg_base_key, nullptr);
 }
@@ -68,11 +68,11 @@ __aicore__ uint32_t get_aicore_profiling_flag() {
     return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(pthread_getspecific(g_aicore_profiling_flag_key)));
 }
 
-__aicore__ void set_aicore_l2_perf_ring(__gm__ L2PerfAicoreRing *ring) {
-    pthread_setspecific(g_aicore_l2_perf_ring_key, reinterpret_cast<void *>(ring));
+__aicore__ void set_aicore_l2_swimlane_ring(__gm__ L2SwimlaneAicoreRing *ring) {
+    pthread_setspecific(g_aicore_l2_swimlane_ring_key, reinterpret_cast<void *>(ring));
 }
-__aicore__ __gm__ L2PerfAicoreRing *get_aicore_l2_perf_ring() {
-    return reinterpret_cast<__gm__ L2PerfAicoreRing *>(pthread_getspecific(g_aicore_l2_perf_ring_key));
+__aicore__ __gm__ L2SwimlaneAicoreRing *get_aicore_l2_swimlane_ring() {
+    return reinterpret_cast<__gm__ L2SwimlaneAicoreRing *>(pthread_getspecific(g_aicore_l2_swimlane_ring_key));
 }
 
 __aicore__ void set_aicore_pmu_ring(__gm__ PmuAicoreRing *ring) {
@@ -111,7 +111,7 @@ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type);
 // executor with its original signature.
 extern "C" void aicore_execute_wrapper(
     __gm__ Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs,
-    uint32_t enable_profiling_flag, uint64_t aicore_l2_perf_ring_addrs, uint64_t aicore_pmu_ring_addrs
+    uint32_t enable_profiling_flag, uint64_t aicore_l2_swimlane_ring_addrs, uint64_t aicore_pmu_ring_addrs
 ) {
     pthread_once(&g_tls_once, create_tls_keys);
 
@@ -130,11 +130,11 @@ extern "C" void aicore_execute_wrapper(
 
     // Publish per-core profiling state before the executor runs.
     set_aicore_profiling_flag(enable_profiling_flag);
-    if ((enable_profiling_flag & PROFILING_FLAG_L2_SWIMLANE) && aicore_l2_perf_ring_addrs != 0) {
-        uint64_t *ring_table = reinterpret_cast<uint64_t *>(aicore_l2_perf_ring_addrs);
-        set_aicore_l2_perf_ring(reinterpret_cast<__gm__ L2PerfAicoreRing *>(ring_table[block_idx]));
+    if ((enable_profiling_flag & PROFILING_FLAG_L2_SWIMLANE) && aicore_l2_swimlane_ring_addrs != 0) {
+        uint64_t *ring_table = reinterpret_cast<uint64_t *>(aicore_l2_swimlane_ring_addrs);
+        set_aicore_l2_swimlane_ring(reinterpret_cast<__gm__ L2SwimlaneAicoreRing *>(ring_table[block_idx]));
     } else {
-        set_aicore_l2_perf_ring(nullptr);
+        set_aicore_l2_swimlane_ring(nullptr);
     }
     if ((enable_profiling_flag & PROFILING_FLAG_PMU) && aicore_pmu_ring_addrs != 0) {
         uint64_t *pmu_ring_table = reinterpret_cast<uint64_t *>(aicore_pmu_ring_addrs);
diff --git a/src/a5/platform/sim/host/CMakeLists.txt b/src/a5/platform/sim/host/CMakeLists.txt
index 88b9cd32f..217d94a6a 100644
--- a/src/a5/platform/sim/host/CMakeLists.txt
+++ b/src/a5/platform/sim/host/CMakeLists.txt
@@ -44,7 +44,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/profiling_copy.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
-    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_perf_collector.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/l2_swimlane_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/scope_stats_collector.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp"
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index 85b79fbdf..1cf0bcef8 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -47,7 +47,7 @@
 typedef int (*aicpu_execute_func_t)(Runtime *runtime);
 typedef void (*aicore_execute_func_t)(
     Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs,
-    uint32_t enable_profiling_flag, uint64_t aicore_l2_perf_ring_addrs, uint64_t aicore_pmu_ring_addrs
+    uint32_t enable_profiling_flag, uint64_t aicore_l2_swimlane_ring_addrs, uint64_t aicore_pmu_ring_addrs
 );
 typedef void (*set_platform_regs_func_t)(uint64_t regs);
 
@@ -253,10 +253,10 @@ int DeviceRunner::ensure_binaries_loaded() {
             return -1;
         }
 
-        set_platform_l2_perf_base_func_ =
-            reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_l2_perf_base"));
-        if (set_platform_l2_perf_base_func_ == nullptr) {
-            LOG_ERROR("dlsym failed for set_platform_l2_perf_base: %s", dlerror());
+        set_platform_l2_swimlane_base_func_ =
+            reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_l2_swimlane_base"));
+        if (set_platform_l2_swimlane_base_func_ == nullptr) {
+            LOG_ERROR("dlsym failed for set_platform_l2_swimlane_base: %s", dlerror());
             return -1;
         }
 
@@ -478,9 +478,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     // Initialize per-subsystem shared memory.
     if (enable_l2_swimlane_) {
-        rc = init_l2_perf(num_aicore, device_id_);
+        rc = init_l2_swimlane(num_aicore, device_id_);
         if (rc != 0) {
-            LOG_ERROR("init_l2_perf failed: %d", rc);
+            LOG_ERROR("init_l2_swimlane failed: %d", rc);
             return rc;
         }
     }
@@ -567,7 +567,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     set_platform_regs_func_(kernel_args_.regs);
     set_platform_dump_base_func_(kernel_args_.dump_data_base);
     set_dump_tensor_enabled_func_(enable_dump_tensor_);
-    set_platform_l2_perf_base_func_(kernel_args_.l2_perf_data_base);
+    set_platform_l2_swimlane_base_func_(kernel_args_.l2_swimlane_data_base);
     set_l2_swimlane_enabled_func_(enable_l2_swimlane_);
     set_platform_pmu_base_func_(kernel_args_.pmu_data_base);
     set_pmu_enabled_func_(enable_pmu_);
@@ -586,7 +586,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return create_thread(std::move(fn));
     };
     if (enable_l2_swimlane_) {
-        l2_perf_collector_.start(thread_factory);
+        l2_swimlane_collector_.start(thread_factory);
     }
     if (enable_dump_tensor_) {
         dump_collector_.start(thread_factory);
@@ -640,7 +640,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         aicore_threads.push_back(create_thread([this, &runtime, i, core_type, physical_core_id]() {
             aicore_execute_func_(
                 &runtime, i, core_type, physical_core_id, kernel_args_.regs, kernel_args_.enable_profiling_flag,
-                kernel_args_.aicore_l2_perf_ring_addrs, kernel_args_.aicore_pmu_ring_addrs
+                kernel_args_.aicore_l2_swimlane_ring_addrs, kernel_args_.aicore_pmu_ring_addrs
             );
         }));
     }
@@ -674,10 +674,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     // directory the user set on CallConfig (validate() enforces non-empty
     // upstream).
     if (enable_l2_swimlane_) {
-        l2_perf_collector_.stop();
-        l2_perf_collector_.read_phase_header_metadata();
-        l2_perf_collector_.reconcile_counters();
-        l2_perf_collector_.export_swimlane_json();
+        l2_swimlane_collector_.stop();
+        l2_swimlane_collector_.read_phase_header_metadata();
+        l2_swimlane_collector_.reconcile_counters();
+        l2_swimlane_collector_.export_swimlane_json();
     }
 
     if (enable_dump_tensor_) {
@@ -739,7 +739,7 @@ void DeviceRunner::unload_executor_binaries() {
         set_platform_regs_func_ = nullptr;
         set_platform_dump_base_func_ = nullptr;
         set_dump_tensor_enabled_func_ = nullptr;
-        set_platform_l2_perf_base_func_ = nullptr;
+        set_platform_l2_swimlane_base_func_ = nullptr;
         set_l2_swimlane_enabled_func_ = nullptr;
         set_platform_pmu_base_func_ = nullptr;
         set_pmu_enabled_func_ = nullptr;
@@ -939,8 +939,8 @@ int DeviceRunner::finalize() {
     }
 
     // Cleanup all profiling subsystems.
-    if (l2_perf_collector_.is_initialized()) {
-        l2_perf_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
+    if (l2_swimlane_collector_.is_initialized()) {
+        l2_swimlane_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
     }
     if (dump_collector_.is_initialized()) {
         dump_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
@@ -1119,8 +1119,8 @@ uint64_t DeviceRunner::upload_chip_callable_buffer(const ChipCallable *callable)
 // =============================================================================
 
 void DeviceRunner::finalize_collectors() {
-    if (l2_perf_collector_.is_initialized()) {
-        l2_perf_collector_.stop();
+    if (l2_swimlane_collector_.is_initialized()) {
+        l2_swimlane_collector_.stop();
     }
     if (dump_collector_.is_initialized()) {
         dump_collector_.stop();
@@ -1130,14 +1130,15 @@ void DeviceRunner::finalize_collectors() {
     }
 }
 
-int DeviceRunner::init_l2_perf(int num_aicore, int device_id) {
-    int rc = l2_perf_collector_.initialize(
-        num_aicore, device_id, l2_perf_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_
+int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
+    int rc = l2_swimlane_collector_.initialize(
+        num_aicore, device_id, l2_swimlane_level_, prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, output_prefix_
     );
     if (rc == 0) {
-        kernel_args_.l2_perf_data_base = reinterpret_cast<uint64_t>(l2_perf_collector_.get_l2_perf_setup_device_ptr());
-        kernel_args_.aicore_l2_perf_ring_addrs =
-            reinterpret_cast<uint64_t>(l2_perf_collector_.get_aicore_ring_addrs_device_ptr());
+        kernel_args_.l2_swimlane_data_base =
+            reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_l2_swimlane_setup_device_ptr());
+        kernel_args_.aicore_l2_swimlane_ring_addrs =
+            reinterpret_cast<uint64_t>(l2_swimlane_collector_.get_aicore_ring_addrs_device_ptr());
     }
     return rc;
 }
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 57caec5e7..2899d35a2 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -47,12 +47,12 @@
 #include "common/core_type.h"
 #include "common/kernel_args.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "host/function_cache.h"
 #include "host/memory_allocator.h"
-#include "host/l2_perf_collector.h"
+#include "host/l2_swimlane_collector.h"
 #include "host/pmu_collector.h"
 #include "host/scope_stats_collector.h"
 #include "host/tensor_dump_collector.h"
@@ -184,8 +184,8 @@ class DeviceRunner {
      * Runtime struct / run() arg list so all three travel the same way.
      */
     void set_l2_swimlane_enabled(int level) {
-        l2_perf_level_ = static_cast<L2PerfLevel>(level);
-        enable_l2_swimlane_ = (l2_perf_level_ != L2PerfLevel::DISABLED);
+        l2_swimlane_level_ = static_cast<L2SwimlaneLevel>(level);
+        enable_l2_swimlane_ = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
     }
     void set_dump_tensor_enabled(bool enable) { enable_dump_tensor_ = enable; }
     void set_pmu_enabled(int enable_pmu) {
@@ -193,7 +193,7 @@ class DeviceRunner {
         pmu_event_type_ = resolve_pmu_event_type(enable_pmu);
     }
     void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; }
-    // Directory under which all diagnostic artifacts (l2_perf_records.json /
+    // Directory under which all diagnostic artifacts (l2_swimlane_records.json /
     // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic
     // is enabled; CallConfig::validate() enforces this contract upstream.
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
@@ -377,7 +377,7 @@ class DeviceRunner {
     void (*set_platform_dump_base_func_)(uint64_t){nullptr};
     void (*set_platform_pmu_base_func_)(uint64_t){nullptr};
     void (*set_dump_tensor_enabled_func_)(bool){nullptr};
-    void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr};
+    void (*set_platform_l2_swimlane_base_func_)(uint64_t){nullptr};
     void (*set_l2_swimlane_enabled_func_)(bool){nullptr};
     void (*set_pmu_enabled_func_)(bool){nullptr};
     void (*set_scope_stats_enabled_func_)(bool){nullptr};
@@ -386,7 +386,7 @@ class DeviceRunner {
     std::string aicore_so_path_;
 
     // Performance profiling
-    L2PerfCollector l2_perf_collector_;
+    L2SwimlaneCollector l2_swimlane_collector_;
 
     // Tensor dump (independent from profiling)
     TensorDumpCollector dump_collector_;
@@ -417,7 +417,7 @@ class DeviceRunner {
      * @param device_id Device ID (ignored in simulation)
      * @return 0 on success, error code on failure
      */
-    int init_l2_perf(int num_aicore, int device_id);
+    int init_l2_swimlane(int num_aicore, int device_id);
 
     /**
      * Initialize tensor dump for simulation.
@@ -439,9 +439,9 @@ class DeviceRunner {
     bool enable_dump_tensor_{false};
     bool enable_pmu_{false};
     bool enable_scope_stats_{false};
-    L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};             // resolved from set_l2_swimlane_enabled()
-    PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};  // resolved from set_pmu_enabled()
-    std::string output_prefix_{};                                  // diagnostic artifact root directory
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};  // resolved from set_l2_swimlane_enabled()
+    PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};   // resolved from set_pmu_enabled()
+    std::string output_prefix_{};                                   // diagnostic artifact root directory
 
     int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id);
     int init_scope_stats(int num_threads);
diff --git a/src/a5/platform/src/aicpu/l2_perf_collector_aicpu.cpp b/src/a5/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
similarity index 64%
rename from src/a5/platform/src/aicpu/l2_perf_collector_aicpu.cpp
rename to src/a5/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
index fdf16986c..877383642 100644
--- a/src/a5/platform/src/aicpu/l2_perf_collector_aicpu.cpp
+++ b/src/a5/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -10,15 +10,15 @@
  */
 
 /**
- * @file l2_perf_collector_aicpu.cpp
+ * @file l2_swimlane_collector_aicpu.cpp
  * @brief AICPU performance data collection implementation (SPSC free queue)
  *
- * Uses per-core L2PerfBufferState with SPSC free queues for O(1) buffer switching.
+ * Uses per-core L2SwimlaneAicpuTaskPool with SPSC free queues for O(1) buffer switching.
  * Host memory manager dynamically allocates replacement buffers and pushes
  * them into the free_queue. Device pops from free_queue when switching.
  */
 
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 
 #include <cinttypes>
 #include <cstring>
@@ -29,58 +29,60 @@
 #include "common/unified_log.h"
 
 // Cached pointers for hot-path access (set during init)
-static AicpuPhaseHeader *s_phase_header = nullptr;
-static L2PerfDataHeader *s_l2_perf_header = nullptr;
+static L2SwimlaneAicpuPhaseHeader *s_l2_swimlane_aicpu_phase_header = nullptr;
+static L2SwimlaneDataHeader *s_l2_swimlane_header = nullptr;
 
-// Per-core L2PerfBufferState cache
-static L2PerfBufferState *s_perf_buffer_states[PLATFORM_MAX_CORES] = {};
+// Per-core L2SwimlaneAicpuTaskPool cache
+static L2SwimlaneAicpuTaskPool *s_aicpu_task_pools[PLATFORM_MAX_CORES] = {};
 
-// Per-core L2PerfAicoreRing cache (stable for the run; AICPU reads, AICore writes)
-static L2PerfAicoreRing *s_perf_aicore_rings[PLATFORM_MAX_CORES] = {};
+// Per-core L2SwimlaneAicoreRing cache (stable for the run; AICPU reads, AICore writes)
+static L2SwimlaneAicoreRing *s_perf_aicore_rings[PLATFORM_MAX_CORES] = {};
 
 // Per-core cached current-records-buffer pointer. Written by AICPU when
 // rotating buffers from inside `complete_record`; AICPU never publishes this
 // to AICore (AICore only sees the stable ring).
-static L2PerfBuffer *s_perf_records_buffers[PLATFORM_MAX_CORES] = {};
+static L2SwimlaneAicpuTaskBuffer *s_current_aicpu_task_buffers[PLATFORM_MAX_CORES] = {};
 
-// Per-thread PhaseBufferState cache
-static PhaseBufferState *s_phase_buffer_states[PLATFORM_MAX_AICPU_THREADS] = {};
-static PhaseBuffer *s_current_phase_buf[PLATFORM_MAX_AICPU_THREADS] = {};
+// Per-thread L2SwimlaneAicpuPhasePool cache
+static L2SwimlaneAicpuPhasePool *s_aicpu_phase_pools[PLATFORM_MAX_AICPU_THREADS] = {};
+static L2SwimlaneAicpuPhaseBuffer *s_current_aicpu_phase_buffers[PLATFORM_MAX_AICPU_THREADS] = {};
 
 static int s_orch_thread_idx = -1;
 
-// L2 perf platform state. Published by the host (via dlsym'd setters on sim)
+// L2 swimlane platform state. Published by the host (via dlsym'd setters on sim)
 // or by the AICPU kernel entry (onboard) before perf init runs, so downstream
 // perf code can discover enablement + device-base without reading the generic
 // Runtime struct. Two channels (mirrors PMU):
 //   - g_enable_l2_swimlane (bool) — set at kernel entry from the bitmask bit
-//   - g_l2_perf_level (L2PerfLevel) — promoted in l2_perf_aicpu_init from the
+//   - g_l2_swimlane_level (L2SwimlaneLevel) — promoted in l2_swimlane_aicpu_init from the
 //     shared-memory header so `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES`
-//     gates have the granular value (exposed via get_l2_perf_level()).
-static uint64_t g_platform_l2_perf_base = 0;
+//     gates have the granular value (exposed via get_l2_swimlane_level()).
+static uint64_t g_platform_l2_swimlane_base = 0;
 static bool g_enable_l2_swimlane = false;
-static L2PerfLevel g_l2_perf_level = L2PerfLevel::DISABLED;
+static L2SwimlaneLevel g_l2_swimlane_level = L2SwimlaneLevel::DISABLED;
 
-extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base) { g_platform_l2_perf_base = l2_perf_data_base; }
-extern "C" uint64_t get_platform_l2_perf_base() { return g_platform_l2_perf_base; }
+extern "C" void set_platform_l2_swimlane_base(uint64_t l2_swimlane_data_base) {
+    g_platform_l2_swimlane_base = l2_swimlane_data_base;
+}
+extern "C" uint64_t get_platform_l2_swimlane_base() { return g_platform_l2_swimlane_base; }
 extern "C" void set_l2_swimlane_enabled(bool enable) { g_enable_l2_swimlane = enable; }
 extern "C" bool is_l2_swimlane_enabled() { return g_enable_l2_swimlane; }
-L2PerfLevel get_l2_perf_level() { return g_l2_perf_level; }
+L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; }
 
 /**
  * Enqueue ready buffer to per-thread queue
  *
- * @param header L2PerfDataHeader pointer
+ * @param header L2SwimlaneDataHeader pointer
  * @param thread_idx Thread index
  * @param core_index Core index (or thread_idx for phase entries)
  * @param buffer_ptr Device pointer to the full buffer
  * @param buffer_seq Sequence number for ordering
- * @param is_phase 0 = L2PerfRecord, 1 = Phase
+ * @param kind Buffer kind discriminator (see L2SwimlaneBufferKind)
  * @return 0 on success, -1 if queue full
  */
 static int enqueue_ready_buffer(
-    L2PerfDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq,
-    uint32_t is_phase
+    L2SwimlaneDataHeader *header, int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq,
+    L2SwimlaneBufferKind kind
 ) {
     uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE;
     uint32_t current_tail = header->queue_tails[thread_idx];
@@ -93,7 +95,7 @@ static int enqueue_ready_buffer(
     }
 
     header->queues[thread_idx][current_tail].core_index = core_index;
-    header->queues[thread_idx][current_tail].is_phase = is_phase;
+    header->queues[thread_idx][current_tail].kind = kind;
     header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr;
     header->queues[thread_idx][current_tail].buffer_seq = buffer_seq;
     header->queue_tails[thread_idx] = next_tail;
@@ -101,33 +103,33 @@ static int enqueue_ready_buffer(
     return 0;
 }
 
-void l2_perf_aicpu_init(int worker_count) {
-    void *l2_perf_base = reinterpret_cast<void *>(g_platform_l2_perf_base);
-    if (l2_perf_base == nullptr) {
-        LOG_ERROR("l2_perf_data_base is NULL, cannot initialize profiling");
+void l2_swimlane_aicpu_init(int worker_count) {
+    void *l2_swimlane_base = reinterpret_cast<void *>(g_platform_l2_swimlane_base);
+    if (l2_swimlane_base == nullptr) {
+        LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize profiling");
         return;
     }
 
-    s_l2_perf_header = get_l2_perf_header(l2_perf_base);
+    s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base);
 
     // Read the granular perf_level from the shared-memory header (host wrote
-    // it in L2PerfCollector::initialize). The kernel-entry setter only seeded
+    // it in L2SwimlaneCollector::initialize). The kernel-entry setter only seeded
     // the binary g_enable_l2_swimlane via the bitmask bit.
-    g_l2_perf_level = static_cast<L2PerfLevel>(s_l2_perf_header->l2_perf_level);
+    g_l2_swimlane_level = static_cast<L2SwimlaneLevel>(s_l2_swimlane_header->l2_swimlane_level);
 
     LOG_INFO_V0(
-        "Initializing performance profiling for %d cores (memcpy-based), l2_perf_level=%u", worker_count,
-        static_cast<uint32_t>(g_l2_perf_level)
+        "Initializing performance profiling for %d cores (memcpy-based), l2_swimlane_level=%u", worker_count,
+        static_cast<uint32_t>(g_l2_swimlane_level)
     );
 
     // Pop first buffer from free_queue for each core, and cache the stable
     // AICore staging ring pointer so complete_record can read it without
     // touching SHM.
     for (int i = 0; i < worker_count; i++) {
-        L2PerfBufferState *state = get_perf_buffer_state(l2_perf_base, i);
+        L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(l2_swimlane_base, i);
 
-        s_perf_buffer_states[i] = state;
-        s_perf_aicore_rings[i] = reinterpret_cast<L2PerfAicoreRing *>(state->aicore_ring_ptr);
+        s_aicpu_task_pools[i] = state;
+        s_perf_aicore_rings[i] = reinterpret_cast<L2SwimlaneAicoreRing *>(state->aicore_ring_ptr);
 
         // Pop first buffer from free_queue
         rmb();
@@ -142,15 +144,15 @@ void l2_perf_aicpu_init(int worker_count) {
             state->current_buf_seq = 0;
             wmb();
 
-            L2PerfBuffer *buf = reinterpret_cast<L2PerfBuffer *>(buf_ptr);
+            L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(buf_ptr);
             buf->count = 0;
-            s_perf_records_buffers[i] = buf;
+            s_current_aicpu_task_buffers[i] = buf;
 
             LOG_DEBUG("Core %d: popped initial buffer (addr=0x%lx)", i, buf_ptr);
         } else {
             LOG_ERROR("Core %d: free_queue is empty during init!", i);
             state->current_buf_ptr = 0;
-            s_perf_records_buffers[i] = nullptr;
+            s_current_aicpu_task_buffers[i] = nullptr;
         }
     }
 
@@ -160,18 +162,18 @@ void l2_perf_aicpu_init(int worker_count) {
 }
 
 /**
- * Internal records-buffer rotation. Called from `l2_perf_aicpu_complete_record`
+ * Internal records-buffer rotation. Called from `l2_swimlane_aicpu_complete_task`
  * after a record is committed and the buffer hits capacity. Only swaps an
  * AICPU-private records pointer — AICore reads from a stable ring and is
  * unaffected by this call.
  */
 static void switch_records_buffer(int core_id, int thread_idx) {
-    L2PerfBufferState *state = s_perf_buffer_states[core_id];
+    L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id];
     if (state == nullptr) {
         return;
     }
 
-    L2PerfBuffer *full_buf = s_perf_records_buffers[core_id];
+    L2SwimlaneAicpuTaskBuffer *full_buf = s_current_aicpu_task_buffers[core_id];
     if (full_buf == nullptr) {
         return;
     }
@@ -194,7 +196,9 @@ static void switch_records_buffer(int core_id, int thread_idx) {
 
     // Enqueue full buffer to ReadyQueue
     uint32_t seq = state->current_buf_seq;
-    int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, state->current_buf_ptr, seq, 0);
+    int rc = enqueue_ready_buffer(
+        s_l2_swimlane_header, thread_idx, core_id, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
+    );
     if (rc != 0) {
         LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id);
         // Revert: discard data and keep writing
@@ -212,25 +216,25 @@ static void switch_records_buffer(int core_id, int thread_idx) {
     state->current_buf_seq = seq + 1;
     wmb();
 
-    L2PerfBuffer *new_buf = reinterpret_cast<L2PerfBuffer *>(new_buf_ptr);
+    L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(new_buf_ptr);
     new_buf->count = 0;
-    s_perf_records_buffers[core_id] = new_buf;
+    s_current_aicpu_task_buffers[core_id] = new_buf;
 
     LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr);
 }
 
-int l2_perf_aicpu_complete_record(
+int l2_swimlane_aicpu_complete_task(
     int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type,
     uint64_t dispatch_time, uint64_t finish_time, const uint64_t *fanout, int32_t fanout_count
 ) {
     if (core_id < 0 || core_id >= PLATFORM_MAX_CORES) {
         return -1;
     }
-    L2PerfBufferState *state = s_perf_buffer_states[core_id];
+    L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id];
     if (state == nullptr) {
         return -1;
     }
-    L2PerfAicoreRing *ring = s_perf_aicore_rings[core_id];
+    L2SwimlaneAicoreRing *ring = s_perf_aicore_rings[core_id];
     if (ring == nullptr) {
         return -1;
     }
@@ -239,14 +243,14 @@ int l2_perf_aicpu_complete_record(
     // `device_total - (collected + dropped + mismatch)`.
     state->total_record_count += 1;
 
-    L2PerfBuffer *l2_perf_buf = s_perf_records_buffers[core_id];
-    if (l2_perf_buf == nullptr) {
+    L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id];
+    if (l2_swimlane_buf == nullptr) {
         // No active records buffer (init ran out of free buffers); count as drop
         // so host reconciliation stays consistent.
         state->dropped_record_count += 1;
         return -1;
     }
-    uint32_t count = l2_perf_buf->count;
+    uint32_t count = l2_swimlane_buf->count;
     if (count >= PLATFORM_PROF_BUFFER_SIZE) {
         // Defensive: should not happen because we rotate at end of every commit.
         state->dropped_record_count += 1;
@@ -254,8 +258,8 @@ int l2_perf_aicpu_complete_record(
     }
 
     // Read AICore-published timing from the per-core staging ring.
-    L2PerfRecord *slot = &ring->dual_issue_slots[expected_reg_task_id % PLATFORM_L2_AICORE_RING_SIZE];
-    // One PoC cache line: matches AICore l2_perf_aicore_record_task() dcci(..., SINGLE_CACHE_LINE, ...)
+    L2SwimlaneAicpuTaskRecord *slot = &ring->dual_issue_slots[expected_reg_task_id % PLATFORM_L2_AICORE_RING_SIZE];
+    // One PoC cache line: matches AICore l2_swimlane_aicore_record_task() dcci(..., SINGLE_CACHE_LINE, ...)
     // and aicpu/cache_ops.cpp step size; timing fields live in the first line.
     cache_invalidate_range(slot, 64);
     if (static_cast<uint32_t>(slot->task_id) != expected_reg_task_id) {
@@ -266,7 +270,7 @@ int l2_perf_aicpu_complete_record(
         // dcci before signaling). Surface separately from capacity drops.
         state->mismatch_record_count += 1;
         LOG_ERROR(
-            "L2Perf invariant violated: core %d slot task_id=0x%x expected=0x%x "
+            "L2Swimlane invariant violated: core %d slot task_id=0x%x expected=0x%x "
             "(completion-before-dispatch broken or ring undersized)",
             core_id, static_cast<uint32_t>(slot->task_id), expected_reg_task_id
         );
@@ -274,7 +278,7 @@ int l2_perf_aicpu_complete_record(
     }
 
     // Copy AICore timing to committed record slot
-    L2PerfRecord *record = &l2_perf_buf->records[count];
+    L2SwimlaneAicpuTaskRecord *record = &l2_swimlane_buf->records[count];
     record->start_time = slot->start_time;
     record->end_time = slot->end_time;
 
@@ -284,7 +288,7 @@ int l2_perf_aicpu_complete_record(
     record->core_type = core_type;
 
     // AICPU_TIMING and above: dispatch/finish timing and fanout dependency info
-    if (g_l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+    if (g_l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
         record->dispatch_time = dispatch_time;
         record->finish_time = finish_time;
         if (fanout != nullptr && fanout_count > 0) {
@@ -303,7 +307,7 @@ int l2_perf_aicpu_complete_record(
     }
 
     uint32_t new_count = count + 1;
-    l2_perf_buf->count = new_count;
+    l2_swimlane_buf->count = new_count;
     wmb();
 
     // Rotate after the write so the just-committed record is preserved.
@@ -315,7 +319,7 @@ int l2_perf_aicpu_complete_record(
     return 0;
 }
 
-void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, int core_num) {
+void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int core_num) {
     if (!is_l2_swimlane_enabled()) {
         return;
     }
@@ -328,7 +332,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
 
     for (int i = 0; i < core_num; i++) {
         int core_id = cur_thread_cores[i];
-        L2PerfBufferState *state = s_perf_buffer_states[core_id];
+        L2SwimlaneAicpuTaskPool *state = s_aicpu_task_pools[core_id];
         if (state == nullptr) continue;
 
         rmb();
@@ -338,18 +342,20 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
             continue;
         }
 
-        L2PerfBuffer *buf = reinterpret_cast<L2PerfBuffer *>(buf_ptr);
+        L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(buf_ptr);
         if (buf->count == 0) {
             continue;
         }
 
         uint32_t seq = state->current_buf_seq;
-        int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, core_id, buf_ptr, seq, 0);
+        int rc = enqueue_ready_buffer(
+            s_l2_swimlane_header, thread_idx, core_id, buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
+        );
         if (rc == 0) {
             LOG_INFO_V0("Thread %d: Core %d flushed buffer with %u records", thread_idx, core_id, buf->count);
             flushed_count++;
             state->current_buf_ptr = 0;
-            s_perf_records_buffers[core_id] = nullptr;
+            s_current_aicpu_task_buffers[core_id] = nullptr;
             wmb();
         } else {
             // ready_queue full at end-of-run: account the loss and clear the
@@ -362,7 +368,7 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
             state->dropped_record_count += buf->count;
             buf->count = 0;
             state->current_buf_ptr = 0;
-            s_perf_records_buffers[core_id] = nullptr;
+            s_current_aicpu_task_buffers[core_id] = nullptr;
             wmb();
         }
     }
@@ -372,22 +378,24 @@ void l2_perf_aicpu_flush_buffers(int thread_idx, const int *cur_thread_cores, in
     LOG_INFO_V0("Thread %d: Performance buffer flush complete, %d buffers flushed", thread_idx, flushed_count);
 }
 
-void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
-    void *l2_perf_base = reinterpret_cast<void *>(g_platform_l2_perf_base);
-    if (l2_perf_base == nullptr) {
-        LOG_ERROR("l2_perf_data_base is NULL, cannot initialize phase profiling");
+void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads) {
+    void *l2_swimlane_base = reinterpret_cast<void *>(g_platform_l2_swimlane_base);
+    if (l2_swimlane_base == nullptr) {
+        LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize phase profiling");
         return;
     }
 
-    s_phase_header = get_phase_header(l2_perf_base, worker_count);
-    s_l2_perf_header = get_l2_perf_header(l2_perf_base);
+    s_l2_swimlane_aicpu_phase_header = get_phase_header(l2_swimlane_base, worker_count);
+    s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base);
 
-    s_phase_header->magic = AICPU_PHASE_MAGIC;
-    s_phase_header->num_sched_threads = num_sched_threads;
-    s_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD;
-    s_phase_header->num_cores = 0;
+    s_l2_swimlane_aicpu_phase_header->magic = L2_SWIMLANE_AICPU_PHASE_MAGIC;
+    s_l2_swimlane_aicpu_phase_header->num_sched_threads = num_sched_threads;
+    s_l2_swimlane_aicpu_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD;
+    s_l2_swimlane_aicpu_phase_header->num_cores = 0;
 
-    memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread));
+    memset(
+        s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
+    );
 
     // Cache per-thread record pointers and clear buffers
     // Include all threads: scheduler + orchestrator (orchestrators may become schedulers)
@@ -396,9 +404,9 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
         total_threads = PLATFORM_MAX_AICPU_THREADS;
     }
     for (int t = 0; t < total_threads; t++) {
-        PhaseBufferState *state = get_phase_buffer_state(l2_perf_base, worker_count, t);
+        L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(l2_swimlane_base, worker_count, t);
 
-        s_phase_buffer_states[t] = state;
+        s_aicpu_phase_pools[t] = state;
 
         // Pop first buffer from free_queue
         rmb();
@@ -413,22 +421,22 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
             state->current_buf_seq = 0;
             wmb();
 
-            PhaseBuffer *buf = reinterpret_cast<PhaseBuffer *>(buf_ptr);
+            L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(buf_ptr);
             buf->count = 0;
-            s_current_phase_buf[t] = buf;
+            s_current_aicpu_phase_buffers[t] = buf;
 
             LOG_DEBUG("Thread %d: popped initial phase buffer (addr=0x%lx)", t, buf_ptr);
         } else {
             LOG_ERROR("Thread %d: phase free_queue is empty during init!", t);
             state->current_buf_ptr = 0;
-            s_current_phase_buf[t] = nullptr;
+            s_current_aicpu_phase_buffers[t] = nullptr;
         }
     }
 
     // Clear remaining slots
     for (int t = total_threads; t < PLATFORM_MAX_AICPU_THREADS; t++) {
-        s_phase_buffer_states[t] = nullptr;
-        s_current_phase_buf[t] = nullptr;
+        s_aicpu_phase_pools[t] = nullptr;
+        s_current_aicpu_phase_buffers[t] = nullptr;
     }
 
     wmb();
@@ -443,21 +451,23 @@ void l2_perf_aicpu_init_phase(int worker_count, int num_sched_threads) {
  * Switch phase buffer when current buffer is full (free queue version)
  *
  * Enqueues the full buffer to ReadyQueue and pops the next buffer from free_queue.
- * If no free buffer is available, sets s_current_phase_buf to nullptr so subsequent
+ * If no free buffer is available, sets s_current_aicpu_phase_buffers[thread_idx] to nullptr so subsequent
  * records are dropped (preserving already-enqueued data).
  */
 static void switch_phase_buffer(int thread_idx) {
-    PhaseBufferState *state = s_phase_buffer_states[thread_idx];
+    L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx];
     if (state == nullptr) return;
 
-    PhaseBuffer *full_buf = s_current_phase_buf[thread_idx];
+    L2SwimlaneAicpuPhaseBuffer *full_buf = s_current_aicpu_phase_buffers[thread_idx];
     if (full_buf == nullptr) return;
 
     LOG_INFO_V0("Thread %d: phase buffer is full (count=%u)", thread_idx, full_buf->count);
 
     // Enqueue to ReadyQueue
     uint32_t seq = state->current_buf_seq;
-    int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, state->current_buf_ptr, seq, 1);
+    int rc = enqueue_ready_buffer(
+        s_l2_swimlane_header, thread_idx, thread_idx, state->current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase
+    );
     if (rc != 0) {
         LOG_ERROR("Thread %d: failed to enqueue phase buffer (queue full), discarding data", thread_idx);
         // Treat the entire un-enqueued buffer as dropped to keep the
@@ -482,29 +492,29 @@ static void switch_phase_buffer(int thread_idx) {
         state->current_buf_seq = seq + 1;
         wmb();
 
-        PhaseBuffer *new_buf = reinterpret_cast<PhaseBuffer *>(new_buf_ptr);
+        L2SwimlaneAicpuPhaseBuffer *new_buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(new_buf_ptr);
         new_buf->count = 0;
-        s_current_phase_buf[thread_idx] = new_buf;
+        s_current_aicpu_phase_buffers[thread_idx] = new_buf;
 
         LOG_INFO_V0("Thread %d: switched to new phase buffer", thread_idx);
     } else {
         // No free buffer available, drop subsequent records
         LOG_WARN("Thread %d: no free phase buffer available, dropping records until Host catches up", thread_idx);
-        s_current_phase_buf[thread_idx] = nullptr;
+        s_current_aicpu_phase_buffers[thread_idx] = nullptr;
         state->current_buf_ptr = 0;
         wmb();
     }
 }
 
-void l2_perf_aicpu_record_phase(
-    int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
+void l2_swimlane_aicpu_record_phase(
+    int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
     uint64_t tasks_processed, uint32_t extra1, uint32_t extra2
 ) {
-    if (s_phase_header == nullptr) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
         return;
     }
 
-    PhaseBufferState *state = s_phase_buffer_states[thread_idx];
+    L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx];
     if (state == nullptr) {
         return;
     }
@@ -513,7 +523,7 @@ void l2_perf_aicpu_record_phase(
     // as `device_total - (collected + dropped)` (mirrors PERF accounting).
     state->total_record_count += 1;
 
-    PhaseBuffer *buf = s_current_phase_buf[thread_idx];
+    L2SwimlaneAicpuPhaseBuffer *buf = s_current_aicpu_phase_buffers[thread_idx];
 
     // Try to recover from nullptr (no buffer was available on previous switch)
     if (buf == nullptr) {
@@ -529,9 +539,9 @@ void l2_perf_aicpu_record_phase(
             state->current_buf_seq = state->current_buf_seq + 1;
             wmb();
 
-            buf = reinterpret_cast<PhaseBuffer *>(buf_ptr);
+            buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(buf_ptr);
             buf->count = 0;
-            s_current_phase_buf[thread_idx] = buf;
+            s_current_aicpu_phase_buffers[thread_idx] = buf;
 
             LOG_INFO_V0("Thread %d: recovered phase buffer", thread_idx);
         }
@@ -546,7 +556,7 @@ void l2_perf_aicpu_record_phase(
     if (idx >= PLATFORM_PHASE_RECORDS_PER_THREAD) {
         // Buffer full, switch to next buffer
         switch_phase_buffer(thread_idx);
-        buf = s_current_phase_buf[thread_idx];
+        buf = s_current_aicpu_phase_buffers[thread_idx];
         if (buf == nullptr) {
             state->dropped_record_count += 1;
             return;
@@ -558,7 +568,7 @@ void l2_perf_aicpu_record_phase(
         }
     }
 
-    AicpuPhaseRecord *record = &buf->records[idx];
+    L2SwimlaneAicpuPhaseRecord *record = &buf->records[idx];
     record->start_time = start_time;
     record->end_time = end_time;
     record->loop_iter = loop_iter;
@@ -570,21 +580,21 @@ void l2_perf_aicpu_record_phase(
     buf->count = idx + 1;
 }
 
-void l2_perf_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thread_idx; }
+void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thread_idx; }
 
-void l2_perf_aicpu_record_orch_phase(
-    AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
+void l2_swimlane_aicpu_record_orch_phase(
+    L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
 ) {
-    if (s_orch_thread_idx < 0 || s_phase_header == nullptr) return;
-    l2_perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
+    if (s_orch_thread_idx < 0 || s_l2_swimlane_aicpu_phase_header == nullptr) return;
+    l2_swimlane_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
 }
 
-void l2_perf_aicpu_flush_phase_buffers(int thread_idx) {
-    if (s_phase_header == nullptr || s_l2_perf_header == nullptr) {
+void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr || s_l2_swimlane_header == nullptr) {
         return;
     }
 
-    PhaseBufferState *state = s_phase_buffer_states[thread_idx];
+    L2SwimlaneAicpuPhasePool *state = s_aicpu_phase_pools[thread_idx];
     if (state == nullptr) return;
 
     rmb();
@@ -594,13 +604,15 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) {
         return;
     }
 
-    PhaseBuffer *buf = reinterpret_cast<PhaseBuffer *>(buf_ptr);
+    L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(buf_ptr);
     if (buf->count == 0) {
         return;
     }
 
     uint32_t seq = state->current_buf_seq;
-    int rc = enqueue_ready_buffer(s_l2_perf_header, thread_idx, thread_idx, buf_ptr, seq, 1);
+    int rc = enqueue_ready_buffer(
+        s_l2_swimlane_header, thread_idx, thread_idx, buf_ptr, seq, L2SwimlaneBufferKind::AicpuPhase
+    );
     if (rc == 0) {
         LOG_INFO_V0("Thread %d: flushed phase buffer with %u records", thread_idx, buf->count);
     } else {
@@ -609,28 +621,30 @@ void l2_perf_aicpu_flush_phase_buffers(int thread_idx) {
         buf->count = 0;
     }
     state->current_buf_ptr = 0;
-    s_current_phase_buf[thread_idx] = nullptr;
+    s_current_aicpu_phase_buffers[thread_idx] = nullptr;
     wmb();
 }
 
-void l2_perf_aicpu_init_core_assignments(int total_cores) {
-    if (s_phase_header == nullptr) {
+void l2_swimlane_aicpu_init_core_assignments(int total_cores) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
         return;
     }
-    memset(s_phase_header->core_to_thread, -1, sizeof(s_phase_header->core_to_thread));
-    s_phase_header->num_cores = static_cast<uint32_t>(total_cores);
+    memset(
+        s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
+    );
+    s_l2_swimlane_aicpu_phase_header->num_cores = static_cast<uint32_t>(total_cores);
     wmb();
     LOG_INFO_V0("Core-to-thread mapping init: %d cores", total_cores);
 }
 
-void l2_perf_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) {
-    if (s_phase_header == nullptr) {
+void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) {
+    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
         return;
     }
     for (int i = 0; i < core_num; i++) {
         int core_id = core_ids[i];
         if (core_id >= 0 && core_id < PLATFORM_MAX_CORES) {
-            s_phase_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
+            s_l2_swimlane_aicpu_phase_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
         }
     }
     wmb();
diff --git a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
index 33c04f783..15eb4405a 100644
--- a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
+++ b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
@@ -13,7 +13,7 @@
  * @file tensor_dump_aicpu.cpp
  * @brief AICPU tensor dump collection implementation
  *
- * Mirrors l2_perf_collector_aicpu.cpp patterns:
+ * Mirrors l2_swimlane_collector_aicpu.cpp patterns:
  * - Per-thread DumpBufferState with SPSC free queues
  * - Per-thread ready queue for handing off full metadata buffers
  * - Per-thread circular arena for tensor payload data
diff --git a/src/a5/platform/src/host/l2_perf_collector.cpp b/src/a5/platform/src/host/l2_swimlane_collector.cpp
similarity index 80%
rename from src/a5/platform/src/host/l2_perf_collector.cpp
rename to src/a5/platform/src/host/l2_swimlane_collector.cpp
index 028d374a6..e57c8ed8e 100644
--- a/src/a5/platform/src/host/l2_perf_collector.cpp
+++ b/src/a5/platform/src/host/l2_swimlane_collector.cpp
@@ -10,21 +10,21 @@
  */
 
 /**
- * @file l2_perf_collector.cpp
+ * @file l2_swimlane_collector.cpp
  * @brief Performance data collector implementation. The mgmt-thread +
  *        buffer-pool machinery lives in profiling_common::BufferPoolManager
- *        parameterized by L2PerfModule (host/l2_perf_collector.h); the
+ *        parameterized by L2SwimlaneModule (host/l2_swimlane_collector.h); the
  *        poll loop lives in profiling_common::ProfilerBase. This file
  *        owns the per-buffer on_buffer_collected callback and the export
  *        logic.
  *
  * a5 specifics: device↔host transfers go through profiling_copy.h. The
  * framework's mgmt loop mirrors the shm region per tick; per-buffer
- * payloads (L2PerfBuffer / PhaseBuffer) are pulled on demand inside
+ * payloads (L2SwimlaneAicpuTaskBuffer / L2SwimlaneAicpuPhaseBuffer) are pulled on demand inside
  * ProfilerAlgorithms.
  */
 
-#include "host/l2_perf_collector.h"
+#include "host/l2_swimlane_collector.h"
 
 #include <algorithm>
 #include <chrono>
@@ -43,7 +43,7 @@
 #include "host/profiling_copy.h"
 
 // =============================================================================
-// L2PerfCollector Implementation
+// L2SwimlaneCollector Implementation
 // =============================================================================
 
 /**
@@ -51,18 +51,18 @@
  * Scheduler phases: SCHED_COMPLETE(0), SCHED_DISPATCH(1), SCHED_SCAN(2), SCHED_IDLE_WAIT(3)
  * Orchestrator phases: ORCH_SYNC(16) through ORCH_SCOPE_END(24)
  */
-static bool is_scheduler_phase(AicpuPhaseId id) {
-    return static_cast<uint32_t>(id) < static_cast<uint32_t>(AicpuPhaseId::SCHED_PHASE_COUNT);
+static bool is_scheduler_phase(L2SwimlaneAicpuPhaseId id) {
+    return static_cast<uint32_t>(id) < static_cast<uint32_t>(L2SwimlaneAicpuPhaseId::SCHED_PHASE_COUNT);
 }
 
-L2PerfCollector::~L2PerfCollector() {
+L2SwimlaneCollector::~L2SwimlaneCollector() {
     stop();
     if (shm_host_ != nullptr) {
-        LOG_WARN("L2PerfCollector destroyed without finalize()");
+        LOG_WARN("L2SwimlaneCollector destroyed without finalize()");
     }
 }
 
-void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) {
+void *L2SwimlaneCollector::alloc_single_buffer(size_t size, void **host_ptr_out) {
     void *dev_ptr = alloc_cb_(size);
     if (dev_ptr == nullptr) {
         LOG_ERROR("Failed to allocate buffer (%zu bytes)", size);
@@ -98,12 +98,12 @@ void *L2PerfCollector::alloc_single_buffer(size_t size, void **host_ptr_out) {
     return dev_ptr;
 }
 
-int L2PerfCollector::initialize(
-    int num_aicore, int device_id, L2PerfLevel l2_perf_level, const L2PerfAllocCallback &alloc_cb,
-    L2PerfRegisterCallback register_cb, const L2PerfFreeCallback &free_cb, const std::string &output_prefix
+int L2SwimlaneCollector::initialize(
+    int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb,
+    L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
 ) {
     if (shm_host_ != nullptr) {
-        LOG_ERROR("L2PerfCollector already initialized");
+        LOG_ERROR("L2SwimlaneCollector already initialized");
         return -1;
     }
 
@@ -115,7 +115,7 @@ int L2PerfCollector::initialize(
     }
 
     num_aicore_ = num_aicore;
-    l2_perf_level_ = l2_perf_level;
+    l2_swimlane_level_ = l2_swimlane_level;
     output_prefix_ = output_prefix;
     total_perf_collected_ = 0;
     total_phase_collected_ = 0;
@@ -135,9 +135,9 @@ int L2PerfCollector::initialize(
 
     LOG_DEBUG("Shared memory allocation plan:");
     LOG_DEBUG("  Number of cores:        %d", num_aicore);
-    LOG_DEBUG("  Header size:            %zu bytes", sizeof(L2PerfDataHeader));
-    LOG_DEBUG("  L2PerfBufferState size: %zu bytes each", sizeof(L2PerfBufferState));
-    LOG_DEBUG("  PhaseBufferState size:  %zu bytes each", sizeof(PhaseBufferState));
+    LOG_DEBUG("  Header size:            %zu bytes", sizeof(L2SwimlaneDataHeader));
+    LOG_DEBUG("  L2SwimlaneAicpuTaskPool size: %zu bytes each", sizeof(L2SwimlaneAicpuTaskPool));
+    LOG_DEBUG("  L2SwimlaneAicpuPhasePool size:  %zu bytes each", sizeof(L2SwimlaneAicpuPhasePool));
     LOG_DEBUG("  Total shared memory:    %zu bytes (%zu KB)", total_size, total_size / 1024);
 
     // Step 2: Allocate shared memory + paired host shadow
@@ -151,21 +151,21 @@ int L2PerfCollector::initialize(
 
     // Step 3: Initialize header on host shadow
     std::memset(perf_host_ptr, 0, total_size);
-    L2PerfDataHeader *header = get_l2_perf_header(perf_host_ptr);
+    L2SwimlaneDataHeader *header = get_l2_swimlane_header(perf_host_ptr);
     for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) {
         header->queue_heads[t] = 0;
         header->queue_tails[t] = 0;
     }
     header->num_cores = num_aicore;
-    header->l2_perf_level = static_cast<uint32_t>(l2_perf_level_);
+    header->l2_swimlane_level = static_cast<uint32_t>(l2_swimlane_level_);
 
-    LOG_DEBUG("Initialized L2PerfDataHeader:");
+    LOG_DEBUG("Initialized L2SwimlaneDataHeader:");
     LOG_DEBUG("  num_cores:              %d", header->num_cores);
-    LOG_DEBUG("  l2_perf_level:          %u", header->l2_perf_level);
+    LOG_DEBUG("  l2_swimlane_level:          %u", header->l2_swimlane_level);
     LOG_DEBUG("  buffer_capacity:        %d", PLATFORM_PROF_BUFFER_SIZE);
     LOG_DEBUG("  queue capacity:         %d", PLATFORM_PROF_READYQUEUE_SIZE);
 
-    // Step 4: Allocate per-core stable L2PerfAicoreRings + the address-table
+    // Step 4: Allocate per-core stable L2SwimlaneAicoreRings + the address-table
     // buffer. Rings are allocated once and never rotated; AICore writes into
     // them at task time, AICPU reads at FIN time. The address-table mirrors
     // each ring's device pointer so the AICore-side `KernelArgs` machinery
@@ -175,23 +175,23 @@ int L2PerfCollector::initialize(
     size_t table_size = static_cast<size_t>(num_aicore) * sizeof(uint64_t);
     void *table_dev_ptr = alloc_single_buffer(table_size, &table_host_ptr);
     if (table_dev_ptr == nullptr) {
-        LOG_ERROR("Failed to allocate L2Perf aicore ring address table (%zu bytes)", table_size);
+        LOG_ERROR("Failed to allocate L2Swimlane aicore ring address table (%zu bytes)", table_size);
         return -1;
     }
     std::memset(table_host_ptr, 0, table_size);
     aicore_ring_addrs_dev_ = table_dev_ptr;
     aicore_ring_addrs_host_ = table_host_ptr;
 
-    // Step 4b: Initialize L2PerfBufferStates — 1 buffer/core in free_queue, rest to recycled pool.
+    // Step 4b: Initialize L2SwimlaneAicpuTaskPools — 1 buffer/core in free_queue, rest to recycled pool.
     for (int i = 0; i < num_aicore; i++) {
-        L2PerfBufferState *state = get_perf_buffer_state(perf_host_ptr, i);
-        std::memset(state, 0, sizeof(L2PerfBufferState));
+        L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(perf_host_ptr, i);
+        std::memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool));
 
         // Allocate the per-core staging ring (no host shadow needed: AICore
         // writes, AICPU reads — host never touches the ring directly).
-        void *ring_dev = alloc_cb(sizeof(L2PerfAicoreRing));
+        void *ring_dev = alloc_cb(sizeof(L2SwimlaneAicoreRing));
         if (ring_dev == nullptr) {
-            LOG_ERROR("Failed to allocate L2PerfAicoreRing for core %d", i);
+            LOG_ERROR("Failed to allocate L2SwimlaneAicoreRing for core %d", i);
             return -1;
         }
         aicore_rings_dev_[i] = ring_dev;
@@ -200,22 +200,22 @@ int L2PerfCollector::initialize(
 
         for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_CORE; s++) {
             void *host_buf_ptr = nullptr;
-            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2PerfBuffer), &host_buf_ptr);
+            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuTaskBuffer), &host_buf_ptr);
             if (dev_buf_ptr == nullptr) {
-                LOG_ERROR("Failed to allocate L2PerfBuffer for core %d, buffer %d", i, s);
+                LOG_ERROR("Failed to allocate L2SwimlaneAicpuTaskBuffer for core %d, buffer %d", i, s);
                 return -1;
             }
 
             if (s == 0) {
                 state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
             } else {
-                manager_.push_recycled(static_cast<int>(ProfBufferType::PERF_RECORD), dev_buf_ptr);
+                manager_.push_recycled(static_cast<int>(ProfBufferType::AICPU_TASK), dev_buf_ptr);
             }
         }
         state->free_queue.tail = 1;
     }
     LOG_DEBUG(
-        "Initialized %d L2PerfBufferStates: 1 buffer/core, %d in recycled pool", num_aicore,
+        "Initialized %d L2SwimlaneAicpuTaskPools: 1 buffer/core, %d in recycled pool", num_aicore,
         num_aicore * (PLATFORM_PROF_BUFFERS_PER_CORE - 1)
     );
 
@@ -224,21 +224,21 @@ int L2PerfCollector::initialize(
 
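-    // Step 5: Initialize PhaseBufferStates — 1 buffer/thread in free_queue, rest to recycled pool.
+    // Step 5: Initialize L2SwimlaneAicpuPhasePools — 1 buffer/thread in free_queue, rest to recycled pool.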
     for (int t = 0; t < num_phase_threads; t++) {
-        PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t);
-        std::memset(state, 0, sizeof(PhaseBufferState));
+        L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t);
+        std::memset(state, 0, sizeof(L2SwimlaneAicpuPhasePool));
 
         for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) {
             void *host_buf_ptr = nullptr;
-            void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr);
+            void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuPhaseBuffer), &host_buf_ptr);
             if (dev_buf_ptr == nullptr) {
-                LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s);
+                LOG_ERROR("Failed to allocate L2SwimlaneAicpuPhaseBuffer for thread %d, buffer %d", t, s);
                 return -1;
             }
 
             if (s == 0) {
                 state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
             } else {
-                manager_.push_recycled(static_cast<int>(ProfBufferType::PHASE), dev_buf_ptr);
+                manager_.push_recycled(static_cast<int>(ProfBufferType::AICPU_PHASE), dev_buf_ptr);
             }
         }
         state->free_queue.tail = 1;
@@ -259,7 +259,7 @@ int L2PerfCollector::initialize(
     collected_perf_records_.assign(num_aicore_, {});
     collected_phase_records_.assign(PLATFORM_MAX_AICPU_THREADS, {});
 
-    LOG_DEBUG("L2 perf device base = 0x%lx", reinterpret_cast<uint64_t>(perf_dev_ptr));
+    LOG_DEBUG("L2 swimlane device base = 0x%lx", reinterpret_cast<uint64_t>(perf_dev_ptr));
     LOG_INFO_V0("Performance profiling initialized (dynamic buffer mode)");
     return 0;
 }
@@ -268,8 +268,8 @@ int L2PerfCollector::initialize(
 // ProfilerBase callbacks
 // ---------------------------------------------------------------------------
 
-void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
-    L2PerfBuffer *buf = reinterpret_cast<L2PerfBuffer *>(info.host_buffer_ptr);
+void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
+    L2SwimlaneAicpuTaskBuffer *buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(info.host_buffer_ptr);
     rmb();
     uint32_t count = buf->count;
     if (count > PLATFORM_PROF_BUFFER_SIZE) {
@@ -284,8 +284,8 @@ void L2PerfCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
     }
 }
 
-void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) {
-    PhaseBuffer *buf = reinterpret_cast<PhaseBuffer *>(info.host_buffer_ptr);
+void L2SwimlaneCollector::copy_phase_buffer(const ReadyBufferInfo &info) {
+    L2SwimlaneAicpuPhaseBuffer *buf = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(info.host_buffer_ptr);
     rmb();
     uint32_t count = buf->count;
     if (count > static_cast<uint32_t>(PLATFORM_PHASE_RECORDS_PER_THREAD)) {
@@ -303,8 +303,8 @@ void L2PerfCollector::copy_phase_buffer(const ReadyBufferInfo &info) {
     }
 }
 
-void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) {
-    if (info.type == ProfBufferType::PERF_RECORD) {
+void L2SwimlaneCollector::on_buffer_collected(const ReadyBufferInfo &info) {
+    if (info.type == ProfBufferType::AICPU_TASK) {
         copy_perf_buffer(info);
     } else {
         copy_phase_buffer(info);
@@ -320,13 +320,13 @@ void L2PerfCollector::on_buffer_collected(const ReadyBufferInfo &info) {
 // clear current_buf_ptr on the device side. Host's job here is purely
 // accounting + sanity check.
 //
-// L2PerfBufferState now tracks total / dropped / mismatch counters — same
+// L2SwimlaneAicpuTaskPool now tracks total / dropped / mismatch counters — same
 // three-bucket accounting as PMU and a2a3. The cross-check equation
 // (collected + dropped + mismatch == device_total) is enforced per pool
 // (PERF + PHASE). Empty PHASE pools (runtime emits no phase records) are
 // skipped via the `optional` flag.
 
-void L2PerfCollector::reconcile_counters() {
+void L2SwimlaneCollector::reconcile_counters() {
     if (shm_host_ == nullptr) return;
 
     // Pull the latest BufferStates (current_buf_ptr) before the per-unit
@@ -337,7 +337,7 @@ void L2PerfCollector::reconcile_counters() {
     rmb();
 
     // After stop(), AICPU's per-thread flush hooks
-    // (l2_perf_aicpu_flush_buffers / l2_perf_aicpu_flush_phase_buffers)
+    // (l2_swimlane_aicpu_flush / l2_swimlane_aicpu_flush_phase_buffers)
     // should have either enqueued the active buffer (success →
     // current_buf_ptr=0) or cleared it on enqueue failure. A non-zero
     // pointer with non-zero count means records AICPU neither delivered
@@ -345,16 +345,16 @@ void L2PerfCollector::reconcile_counters() {
     // never written) are fine; AICPU's flush legitimately skips them.
     int leftover_active = 0;
     for (int i = 0; i < num_aicore_; i++) {
-        L2PerfBufferState *state = get_perf_buffer_state(shm_host_, i);
+        L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm_host_, i);
         uint64_t buf_ptr = state->current_buf_ptr;
         if (buf_ptr == 0) continue;
         void *host_ptr = manager_.resolve_host_ptr(reinterpret_cast<void *>(buf_ptr));
         if (host_ptr == nullptr) continue;
-        profiling_copy_from_device(host_ptr, reinterpret_cast<void *>(buf_ptr), sizeof(L2PerfBuffer));
-        uint32_t count = reinterpret_cast<L2PerfBuffer *>(host_ptr)->count;
+        profiling_copy_from_device(host_ptr, reinterpret_cast<void *>(buf_ptr), sizeof(L2SwimlaneAicpuTaskBuffer));
+        uint32_t count = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(host_ptr)->count;
         if (count == 0) continue;
         LOG_ERROR(
-            "L2Perf reconcile: core %d has un-flushed PERF buffer (current_buf_ptr=0x%lx, count=%u) "
+            "L2Swimlane reconcile: core %d has un-flushed PERF buffer (current_buf_ptr=0x%lx, count=%u) "
             "after stop() — device flush failed",
             i, static_cast<unsigned long>(buf_ptr), count
         );
@@ -362,16 +362,16 @@ void L2PerfCollector::reconcile_counters() {
     }
 
     for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) {
-        PhaseBufferState *state = get_phase_buffer_state(shm_host_, num_aicore_, t);
+        L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm_host_, num_aicore_, t);
         uint64_t buf_ptr = state->current_buf_ptr;
         if (buf_ptr == 0) continue;
         void *host_ptr = manager_.resolve_host_ptr(reinterpret_cast<void *>(buf_ptr));
         if (host_ptr == nullptr) continue;
-        profiling_copy_from_device(host_ptr, reinterpret_cast<void *>(buf_ptr), sizeof(PhaseBuffer));
-        uint32_t count = reinterpret_cast<PhaseBuffer *>(host_ptr)->count;
+        profiling_copy_from_device(host_ptr, reinterpret_cast<void *>(buf_ptr), sizeof(L2SwimlaneAicpuPhaseBuffer));
+        uint32_t count = reinterpret_cast<L2SwimlaneAicpuPhaseBuffer *>(host_ptr)->count;
         if (count == 0) continue;
         LOG_ERROR(
-            "L2Perf reconcile: thread %d has un-flushed PHASE buffer (current_buf_ptr=0x%lx, count=%u) "
+            "L2Swimlane reconcile: thread %d has un-flushed PHASE buffer (current_buf_ptr=0x%lx, count=%u) "
             "after stop() — device flush failed",
             t, static_cast<unsigned long>(buf_ptr), count
         );
@@ -379,7 +379,9 @@ void L2PerfCollector::reconcile_counters() {
     }
 
     if (leftover_active > 0) {
-        LOG_ERROR("L2Perf reconcile: %d unit(s) had un-cleared current_buf_ptr — see prior errors", leftover_active);
+        LOG_ERROR(
+            "L2Swimlane reconcile: %d unit(s) had un-cleared current_buf_ptr — see prior errors", leftover_active
+        );
     }
 
     // Cross-check device-side totals against host CSV.  PERF and PHASE
@@ -391,7 +393,7 @@ void L2PerfCollector::reconcile_counters() {
         uint64_t dropped_device = 0;
         uint64_t mismatch_device = 0;
         for (int i = 0; i < unit_count; i++) {
-            L2PerfBufferState *state = get_state(i);
+            L2SwimlaneAicpuTaskPool *state = get_state(i);
             total_device += state->total_record_count;
             dropped_device += state->dropped_record_count;
             mismatch_device += state->mismatch_record_count;
@@ -403,14 +405,14 @@ void L2PerfCollector::reconcile_counters() {
 
         if (dropped_device > 0) {
             LOG_WARN(
-                "L2Perf reconcile: %lu %s records dropped on device side (buffer full / "
+                "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / "
                 "ready_queue full / late FIN after flush).",
                 static_cast<unsigned long>(dropped_device), kind
             );
         }
         if (mismatch_device > 0) {
             LOG_ERROR(
-                "L2Perf reconcile: %lu %s records lost to AICore staging-slot task_id mismatch — "
+                "L2Swimlane reconcile: %lu %s records lost to AICore staging-slot task_id mismatch — "
                 "completion-before-dispatch invariant violated",
                 static_cast<unsigned long>(mismatch_device), kind
             );
@@ -418,7 +420,7 @@ void L2PerfCollector::reconcile_counters() {
         uint64_t accounted = collected + dropped_device + mismatch_device;
         if (accounted != total_device) {
             LOG_WARN(
-                "L2Perf reconcile: %s count mismatch (collected=%lu + dropped=%lu + mismatch=%lu != "
+                "L2Swimlane reconcile: %s count mismatch (collected=%lu + dropped=%lu + mismatch=%lu != "
                 "device_total=%lu, silent_loss=%ld)",
                 kind, static_cast<unsigned long>(collected), static_cast<unsigned long>(dropped_device),
                 static_cast<unsigned long>(mismatch_device), static_cast<unsigned long>(total_device),
@@ -426,8 +428,8 @@ void L2PerfCollector::reconcile_counters() {
             );
         } else {
             LOG_INFO_V0(
-                "L2Perf reconcile: %s counts match (collected=%lu, dropped=%lu, mismatch=%lu, device_total=%lu)", kind,
-                static_cast<unsigned long>(collected), static_cast<unsigned long>(dropped_device),
+                "L2Swimlane reconcile: %s counts match (collected=%lu, dropped=%lu, mismatch=%lu, device_total=%lu)",
+                kind, static_cast<unsigned long>(collected), static_cast<unsigned long>(dropped_device),
                 static_cast<unsigned long>(mismatch_device), static_cast<unsigned long>(total_device)
             );
         }
@@ -450,10 +452,10 @@ void L2PerfCollector::reconcile_counters() {
     );
 }
 
-void L2PerfCollector::read_phase_header_metadata() {
+void L2SwimlaneCollector::read_phase_header_metadata() {
     if (shm_host_ == nullptr) return;
 
-    // Pull the AicpuPhaseHeader portion from device (the mgmt loop's final
+    // Pull the L2SwimlaneAicpuPhaseHeader portion from device (the mgmt loop's final
     // mirror covered it, but re-mirror to be safe in case stop() raced with
     // a final write of core_to_thread mapping).
     if (manager_.shared_mem_dev() != nullptr && shm_size_ > 0) {
@@ -461,11 +463,12 @@ void L2PerfCollector::read_phase_header_metadata() {
     }
     rmb();
 
-    AicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_);
+    L2SwimlaneAicpuPhaseHeader *phase_header = get_phase_header(shm_host_, num_aicore_);
 
-    if (phase_header->magic != AICPU_PHASE_MAGIC) {
+    if (phase_header->magic != L2_SWIMLANE_AICPU_PHASE_MAGIC) {
         LOG_INFO_V0(
-            "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic, AICPU_PHASE_MAGIC
+            "No phase profiling data found (magic mismatch: 0x%x vs 0x%x)", phase_header->magic,
+            L2_SWIMLANE_AICPU_PHASE_MAGIC
         );
         return;
     }
@@ -511,7 +514,7 @@ void L2PerfCollector::read_phase_header_metadata() {
 // export_swimlane_json
 // ---------------------------------------------------------------------------
 
-int L2PerfCollector::export_swimlane_json() {
+int L2SwimlaneCollector::export_swimlane_json() {
     bool has_any_records = false;
     for (const auto &core_records : collected_perf_records_) {
         if (!core_records.empty()) {
@@ -532,7 +535,7 @@ int L2PerfCollector::export_swimlane_json() {
     }
 
     struct TaggedRecord {
-        const L2PerfRecord *record;
+        const L2SwimlaneAicpuTaskRecord *record;
         uint32_t core_id;
     };
     std::vector<TaggedRecord> tagged_records;
@@ -571,7 +574,7 @@ int L2PerfCollector::export_swimlane_json() {
         }
     }
 
-    std::string filepath = output_prefix_ + "/l2_perf_records.json";
+    std::string filepath = output_prefix_ + "/l2_swimlane_records.json";
 
     std::ofstream outfile(filepath);
     if (!outfile.is_open()) {
@@ -579,9 +582,9 @@ int L2PerfCollector::export_swimlane_json() {
         return -1;
     }
 
-    int l2_perf_level = static_cast<int>(l2_perf_level_);
+    int l2_swimlane_level = static_cast<int>(l2_swimlane_level_);
     outfile << "{\n";
-    outfile << "  \"l2_perf_level\": " << l2_perf_level << ",\n";
+    outfile << "  \"l2_swimlane_level\": " << l2_swimlane_level << ",\n";
     outfile << "  \"tasks\": [\n";
 
     for (size_t i = 0; i < tagged_records.size(); ++i) {
@@ -627,41 +630,41 @@ int L2PerfCollector::export_swimlane_json() {
     outfile << "  ]";
 
     // Step: Write phase profiling data (level >= 3)
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-        auto sched_phase_name = [](AicpuPhaseId id) -> const char * {
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        auto sched_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * {
             switch (id) {
-            case AicpuPhaseId::SCHED_COMPLETE:
+            case L2SwimlaneAicpuPhaseId::SCHED_COMPLETE:
                 return "complete";
-            case AicpuPhaseId::SCHED_DISPATCH:
+            case L2SwimlaneAicpuPhaseId::SCHED_DISPATCH:
                 return "dispatch";
-            case AicpuPhaseId::SCHED_SCAN:
+            case L2SwimlaneAicpuPhaseId::SCHED_SCAN:
                 return "scan";
-            case AicpuPhaseId::SCHED_IDLE_WAIT:
+            case L2SwimlaneAicpuPhaseId::SCHED_IDLE_WAIT:
                 return "idle";
             default:
                 return "unknown";
             }
         };
 
-        auto orch_phase_name = [](AicpuPhaseId id) -> const char * {
+        auto orch_phase_name = [](L2SwimlaneAicpuPhaseId id) -> const char * {
             switch (id) {
-            case AicpuPhaseId::ORCH_SYNC:
+            case L2SwimlaneAicpuPhaseId::ORCH_SYNC:
                 return "orch_sync";
-            case AicpuPhaseId::ORCH_ALLOC:
+            case L2SwimlaneAicpuPhaseId::ORCH_ALLOC:
                 return "orch_alloc";
-            case AicpuPhaseId::ORCH_PARAMS:
+            case L2SwimlaneAicpuPhaseId::ORCH_PARAMS:
                 return "orch_params";
-            case AicpuPhaseId::ORCH_LOOKUP:
+            case L2SwimlaneAicpuPhaseId::ORCH_LOOKUP:
                 return "orch_lookup";
-            case AicpuPhaseId::ORCH_HEAP:
+            case L2SwimlaneAicpuPhaseId::ORCH_HEAP:
                 return "orch_heap";
-            case AicpuPhaseId::ORCH_INSERT:
+            case L2SwimlaneAicpuPhaseId::ORCH_INSERT:
                 return "orch_insert";
-            case AicpuPhaseId::ORCH_FANIN:
+            case L2SwimlaneAicpuPhaseId::ORCH_FANIN:
                 return "orch_fanin";
-            case AicpuPhaseId::ORCH_FINALIZE:
+            case L2SwimlaneAicpuPhaseId::ORCH_FINALIZE:
                 return "orch_finalize";
-            case AicpuPhaseId::ORCH_SCOPE_END:
+            case L2SwimlaneAicpuPhaseId::ORCH_SCOPE_END:
                 return "orch_scope_end";
             default:
                 return "unknown";
@@ -684,7 +687,7 @@ int L2PerfCollector::export_swimlane_json() {
                 // Phase-specific deltas (currently only SCHED_DISPATCH carries
                 // pop_hit / pop_miss). Other phases pass zero extras; omitting
                 // them keeps the JSON terse per record.
-                if (pr.phase_id == AicpuPhaseId::SCHED_DISPATCH) {
+                if (pr.phase_id == L2SwimlaneAicpuPhaseId::SCHED_DISPATCH) {
                     outfile << ", \"pop_hit\": " << pr.extra1 << ", \"pop_miss\": " << pr.extra2;
                 }
                 outfile << "}";
@@ -699,12 +702,12 @@ int L2PerfCollector::export_swimlane_json() {
 
         // Per-task orchestrator phase records (level >= 4, filtered from unified collected_phase_records_)
         // Orchestrator timing is no longer emitted as a separate aggregate
-        // block. Per-event AicpuPhaseRecord[] entries (emitted as
+        // block. Per-event L2SwimlaneAicpuPhaseRecord[] entries (emitted as
         // aicpu_orchestrator_phases below) are the single source of truth;
         // the run-window envelope is still visible in the device-side
         // LOG_INFO_V9 "Thread N: orch_start=… orch_end=… orch_cost=…" line.
         bool has_orch_phases = false;
-        if (l2_perf_level_ >= L2PerfLevel::ORCH_PHASES) {
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
             for (const auto &v : collected_phase_records_) {
                 for (const auto &r : v) {
                     if (!is_scheduler_phase(r.phase_id)) {
@@ -764,7 +767,7 @@ int L2PerfCollector::export_swimlane_json() {
 // finalize
 // ---------------------------------------------------------------------------
 
-int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2PerfFreeCallback &free_cb) {
+int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, const L2SwimlaneFreeCallback &free_cb) {
     if (shm_host_ == nullptr) return 0;
 
     // Stop mgmt + collector threads if the caller didn't already (idempotent).
@@ -782,7 +785,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe
     // shadow stays in dev_to_host_ and is freed by clear_mappings() below
     // (single source of truth for shadow lifetime, no double-free risk).
     for (int i = 0; i < num_aicore_; i++) {
-        L2PerfBufferState *state = get_perf_buffer_state(shm_host_, i);
+        L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(shm_host_, i);
 
         release_dev(reinterpret_cast<void *>(state->current_buf_ptr));
         state->current_buf_ptr = 0;
@@ -804,7 +807,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe
 
     int num_phase_threads = PLATFORM_MAX_AICPU_THREADS;
     for (int t = 0; t < num_phase_threads; t++) {
-        PhaseBufferState *state = get_phase_buffer_state(shm_host_, num_aicore_, t);
+        L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm_host_, num_aicore_, t);
 
         release_dev(reinterpret_cast<void *>(state->current_buf_ptr));
         state->current_buf_ptr = 0;
@@ -831,7 +834,7 @@ int L2PerfCollector::finalize(L2PerfUnregisterCallback unregister_cb, const L2Pe
         release_dev(p);
     });
 
-    // Free per-core L2PerfAicoreRings (no host shadow paired). The rings
+    // Free per-core L2SwimlaneAicoreRing buffers (no host shadow paired). The rings
     // were allocated directly via alloc_cb (not alloc_single_buffer), so no
     // entry exists in dev_to_host_ for them.
     for (auto *ring_dev : aicore_rings_dev_) {
diff --git a/src/a5/platform/src/host/pmu_collector.cpp b/src/a5/platform/src/host/pmu_collector.cpp
index 1468afa01..e6af94941 100644
--- a/src/a5/platform/src/host/pmu_collector.cpp
+++ b/src/a5/platform/src/host/pmu_collector.cpp
@@ -317,7 +317,7 @@ void PmuCollector::reconcile_counters() {
 
     // Cross-check device-side totals against host CSV.  PMU is single-kind
     // (one per-core pool), so reconcile_one is invoked once; the lambda
-    // shape matches L2PerfCollector::reconcile_counters so the two
+    // shape matches L2SwimlaneCollector::reconcile_counters so the two
     // single-arch implementations stay diff-able.
     auto reconcile_one = [&](int unit_count, auto get_state, uint64_t collected, bool optional) {
         uint64_t total_device = 0;
diff --git a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp
index 583dc1da7..32e1ff714 100644
--- a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp
+++ b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp
@@ -11,9 +11,9 @@
 
 #include "aicore/aicore.h"
 #include "aicore/aicore_profiling_state.h"
-#include "aicore/l2_perf_collector_aicore.h"
+#include "aicore/l2_swimlane_collector_aicore.h"
 #include "aicore/pmu_collector_aicore.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"  // Platform configuration (C/C++ compatible)
 #include "common/pmu_profiling.h"
 #include "runtime.h"
@@ -60,10 +60,10 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
     // AICore kernel entry from KernelArgs::regs[physical_core_id]), so
     // they are safe to cache here.
     uint32_t profiling_flag = get_aicore_profiling_flag();
-    bool l2_perf_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
+    bool l2_swimlane_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
     bool dump_tensor_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
     bool pmu_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_PMU);
-    __gm__ L2PerfAicoreRing *l2_perf_ring = l2_perf_enabled ? get_aicore_l2_perf_ring() : nullptr;
+    __gm__ L2SwimlaneAicoreRing *l2_swimlane_ring = l2_swimlane_enabled ? get_aicore_l2_swimlane_ring() : nullptr;
     __gm__ PmuAicoreRing *pmu_ring = pmu_enabled ? get_aicore_pmu_ring() : nullptr;
     uint64_t pmu_reg_base = pmu_enabled ? get_aicore_pmu_reg_base() : 0;
 
@@ -105,9 +105,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
                 pipe_barrier(PIPE_ALL);
             }
 
-            if (l2_perf_enabled) {
+            if (l2_swimlane_enabled) {
                 uint64_t end_time = get_sys_cnt_aicore();
-                l2_perf_aicore_record_task(l2_perf_ring, actual_task_id, start_time, end_time);
+                l2_swimlane_aicore_record_task(l2_swimlane_ring, actual_task_id, start_time, end_time);
             }
 
             last_task_id = task_id;
diff --git a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp
index a073cd7c2..729944ea7 100644
--- a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp
@@ -16,13 +16,13 @@
 
 #include "aicpu/device_log.h"
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "callable.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "runtime.h"
@@ -141,7 +141,7 @@ struct AicpuExecutor {
 
     inline bool try_dispatch_task(
         int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head,
-        int &ready_count, bool l2_perf_enabled
+        int &ready_count, bool l2_swimlane_enabled
     );
 };
 
@@ -243,7 +243,7 @@ inline void AicpuExecutor::resolve_task_dependencies(
 // Try to dispatch a task from thread-local queue to a core
 inline bool AicpuExecutor::try_dispatch_task(
     int core_id, uint64_t reg_addr, CoreType core_type, int thread_idx, int *local_queue, int &head, int &ready_count,
-    bool l2_perf_enabled
+    bool l2_swimlane_enabled
 ) {
     if (ready_count <= 0) {
         return false;
@@ -286,7 +286,7 @@ inline bool AicpuExecutor::try_dispatch_task(
     pending_task_ids_[core_id] = task_id;
 
     // Record the real AICPU dispatch point for this core.
-    if (l2_perf_enabled && get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) {
+    if (l2_swimlane_enabled && get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) {
         dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
     }
 
@@ -359,7 +359,7 @@ int AicpuExecutor::init(Runtime *runtime) {
         dispatch_timestamps_[i] = 0;
     }
     if (is_l2_swimlane_enabled()) {
-        l2_perf_aicpu_init(runtime->worker_count);
+        l2_swimlane_aicpu_init(runtime->worker_count);
     }
 #if PTO2_PROFILING
     if (is_dump_tensor_enabled()) {
@@ -681,8 +681,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
 
     int verification_warning_count = 0;
     const int MAX_VERIFICATION_WARNINGS = 10;
-    bool l2_perf_enabled = is_l2_swimlane_enabled();
-    L2PerfLevel l2_perf_level = get_l2_perf_level();
+    bool l2_swimlane_enabled = is_l2_swimlane_enabled();
+    L2SwimlaneLevel l2_swimlane_level = get_l2_swimlane_level();
 
     // Extract array pointers as local variables for better readability and performance
     int *cur_ready_queue_aic = cur_ready_queue_aic_[thread_idx];
@@ -704,7 +704,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
     );
 
     // Initialize dispatch timestamps for all cores (only needed at level >= 2)
-    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
         uint64_t dispatch_start_time = get_sys_cnt_aicpu();
         for (int i = 0; i < core_num; i++) {
             int core_id = cur_thread_cores[i];
@@ -741,54 +741,54 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 // Profiling: when prev_running_id exists, its AICore timing was
                 // written to wip[id & 1] first, so complete it BEFORE the
                 // pending task's record to maintain buffer ordering.
-                if (l2_perf_enabled) {
-                    uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                if (l2_swimlane_enabled) {
+                    uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
 
                     if (prev_running_id != AICPU_TASK_INVALID) {
                         Task *prev_task = &runtime.tasks[prev_running_id];
                         uint64_t fanout_arr[RUNTIME_MAX_FANOUT];
                         int fanout_count = 0;
-                        if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                        if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                             for (int i = 0; i < prev_task->fanout_count; i++) {
                                 fanout_arr[i] = static_cast<uint64_t>(prev_task->fanout[i]);
                             }
                             fanout_count = prev_task->fanout_count;
                         }
-                        if (l2_perf_aicpu_complete_record(
+                        if (l2_swimlane_aicpu_complete_task(
                                 core_id, thread_idx, static_cast<uint32_t>(prev_running_id),
                                 static_cast<uint64_t>(prev_running_id), prev_task->func_id, h->core_type,
                                 dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count
                             ) != 0) {
                             LOG_ERROR(
-                                "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id,
+                                "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id,
                                 prev_running_id
                             );
                         }
-                        if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                        if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                             dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                         }
                     }
 
-                    finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                    finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
                     Task *task = &runtime.tasks[completed_task_id];
                     uint64_t fanout_arr[RUNTIME_MAX_FANOUT];
                     int fanout_count = 0;
-                    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                         for (int i = 0; i < task->fanout_count; i++) {
                             fanout_arr[i] = static_cast<uint64_t>(task->fanout[i]);
                         }
                         fanout_count = task->fanout_count;
                     }
-                    if (l2_perf_aicpu_complete_record(
+                    if (l2_swimlane_aicpu_complete_task(
                             core_id, thread_idx, static_cast<uint32_t>(completed_task_id),
                             static_cast<uint64_t>(completed_task_id), task->func_id, h->core_type,
                             dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count
                         ) != 0) {
                         LOG_ERROR(
-                            "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id
+                            "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id
                         );
                     }
-                    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                         dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                     }
                 }
@@ -805,12 +805,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
                     dispatched = try_dispatch_task(
                         core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                        cur_aic_ready_count, l2_perf_enabled
+                        cur_aic_ready_count, l2_swimlane_enabled
                     );
                 } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
                     dispatched = try_dispatch_task(
                         core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                        cur_aiv_ready_count, l2_perf_enabled
+                        cur_aiv_ready_count, l2_swimlane_enabled
                     );
                 }
 
@@ -842,7 +842,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 made_progress = true;
 
                 // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched)
-                if (!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                if (!dispatched && l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                     dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                 }
             } else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) {
@@ -864,28 +864,29 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 // Count it here to avoid losing completion.
                 if (prev_running_id != AICPU_TASK_INVALID) {
                     // Profiling: complete the implicit task's AICore record
-                    if (l2_perf_enabled) {
-                        uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                    if (l2_swimlane_enabled) {
+                        uint64_t finish_ts =
+                            (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
                         Task *prev_task = &runtime.tasks[prev_running_id];
                         uint64_t fanout_arr[RUNTIME_MAX_FANOUT];
                         int fanout_count = 0;
-                        if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                        if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                             for (int i = 0; i < prev_task->fanout_count; i++) {
                                 fanout_arr[i] = static_cast<uint64_t>(prev_task->fanout[i]);
                             }
                             fanout_count = prev_task->fanout_count;
                         }
-                        if (l2_perf_aicpu_complete_record(
+                        if (l2_swimlane_aicpu_complete_task(
                                 core_id, thread_idx, static_cast<uint32_t>(prev_running_id),
                                 static_cast<uint64_t>(prev_running_id), prev_task->func_id, h->core_type,
                                 dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count
                             ) != 0) {
                             LOG_ERROR(
-                                "Core %d: l2_perf_aicpu_complete_record failed for implicit task %d", core_id,
+                                "Core %d: l2_swimlane_aicpu_complete_task failed for implicit task %d", core_id,
                                 prev_running_id
                             );
                         }
-                        if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                        if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                             dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                         }
                     }
@@ -915,27 +916,27 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
 
                 int completed_task_id = running_task_ids_[core_id];
 
-                if (l2_perf_enabled) {
-                    uint64_t finish_ts = (l2_perf_level >= L2PerfLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
+                if (l2_swimlane_enabled) {
+                    uint64_t finish_ts = (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) ? get_sys_cnt_aicpu() : 0;
                     Task *task = &runtime.tasks[completed_task_id];
                     uint64_t fanout_arr[RUNTIME_MAX_FANOUT];
                     int fanout_count = 0;
-                    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                         for (int i = 0; i < task->fanout_count; i++) {
                             fanout_arr[i] = static_cast<uint64_t>(task->fanout[i]);
                         }
                         fanout_count = task->fanout_count;
                     }
-                    if (l2_perf_aicpu_complete_record(
+                    if (l2_swimlane_aicpu_complete_task(
                             core_id, thread_idx, static_cast<uint32_t>(completed_task_id),
                             static_cast<uint64_t>(completed_task_id), task->func_id, h->core_type,
                             dispatch_timestamps_[core_id], finish_ts, fanout_arr, fanout_count
                         ) != 0) {
                         LOG_ERROR(
-                            "Core %d: l2_perf_aicpu_complete_record failed for task %d", core_id, completed_task_id
+                            "Core %d: l2_swimlane_aicpu_complete_task failed for task %d", core_id, completed_task_id
                         );
                     }
-                    if (l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                    if (l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                         dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                     }
                 }
@@ -950,12 +951,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                     if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
                         dispatched = try_dispatch_task(
                             core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                            cur_aic_ready_count, l2_perf_enabled
+                            cur_aic_ready_count, l2_swimlane_enabled
                         );
                     } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
                         dispatched = try_dispatch_task(
                             core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                            cur_aiv_ready_count, l2_perf_enabled
+                            cur_aiv_ready_count, l2_swimlane_enabled
                         );
                     }
                 }
@@ -969,7 +970,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 made_progress = true;
 
                 // Update timestamp if didn't dispatch (try_dispatch_task updates it if dispatched)
-                if (!dispatched && l2_perf_enabled && l2_perf_level >= L2PerfLevel::AICPU_TIMING) {
+                if (!dispatched && l2_swimlane_enabled && l2_swimlane_level >= L2SwimlaneLevel::AICPU_TIMING) {
                     dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                 }
             }
@@ -979,14 +980,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
                 if (h->core_type == CoreType::AIC && cur_aic_ready_count > 0) {
                     if (try_dispatch_task(
                             core_id, reg_addr, CoreType::AIC, thread_idx, cur_ready_queue_aic, cur_aic_head,
-                            cur_aic_ready_count, l2_perf_enabled
+                            cur_aic_ready_count, l2_swimlane_enabled
                         )) {
                         made_progress = true;
                     }
                 } else if (h->core_type == CoreType::AIV && cur_aiv_ready_count > 0) {
                     if (try_dispatch_task(
                             core_id, reg_addr, CoreType::AIV, thread_idx, cur_ready_queue_aiv, cur_aiv_head,
-                            cur_aiv_ready_count, l2_perf_enabled
+                            cur_aiv_ready_count, l2_swimlane_enabled
                         )) {
                         made_progress = true;
                     }
@@ -1125,7 +1126,7 @@ int AicpuExecutor::run(Runtime *runtime) {
 
     // Flush performance buffers for cores managed by this thread.
     if (is_l2_swimlane_enabled()) {
-        l2_perf_aicpu_flush_buffers(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]);
+        l2_swimlane_aicpu_flush(thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]);
     }
 #if PTO2_PROFILING
     if (is_pmu_enabled()) {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
index bb4a98e91..4583175d0 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
@@ -11,9 +11,9 @@
 
 #include "aicore/aicore.h"
 #include "aicore/aicore_profiling_state.h"
-#include "aicore/l2_perf_collector_aicore.h"
+#include "aicore/l2_swimlane_collector_aicore.h"
 #include "aicore/pmu_collector_aicore.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"  // Register-based communication
 #include "common/pmu_profiling.h"
 #include "pto2_dispatch_payload.h"
@@ -98,10 +98,10 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
     // AICore kernel entry from KernelArgs::regs[physical_core_id]), so
     // they are safe to cache here.
     uint32_t profiling_flag = get_aicore_profiling_flag();
-    bool l2_perf_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
+    bool l2_swimlane_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
     bool dump_tensor_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
     bool pmu_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_PMU);
-    __gm__ L2PerfAicoreRing *l2_perf_ring = l2_perf_enabled ? get_aicore_l2_perf_ring() : nullptr;
+    __gm__ L2SwimlaneAicoreRing *l2_swimlane_ring = l2_swimlane_enabled ? get_aicore_l2_swimlane_ring() : nullptr;
     __gm__ PmuAicoreRing *pmu_ring = pmu_enabled ? get_aicore_pmu_ring() : nullptr;
     uint64_t pmu_reg_base = pmu_enabled ? get_aicore_pmu_reg_base() : 0;
 
@@ -155,9 +155,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
             }
 
             // Performance profiling: record task execution
-            if (l2_perf_enabled) {
+            if (l2_swimlane_enabled) {
                 uint64_t end_time = get_sys_cnt_aicore();
-                l2_perf_aicore_record_task(l2_perf_ring, task_id, start_time, end_time);
+                l2_swimlane_aicore_record_task(l2_swimlane_ring, task_id, start_time, end_time);
             }
 
             last_reg_val = reg_val;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 92dd7db82..8a8a88816 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -35,10 +35,10 @@
 #include "pto_shared_memory.h"
 
 // Performance profiling headers
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/scope_stats_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/unified_log.h"
 
 // Register-based communication
@@ -523,7 +523,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
 
 #if PTO2_PROFILING
-            rt->orchestrator.l2_perf_level = get_l2_perf_level();
+            rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level();
             {
                 auto &orch = rt->orchestrator;
                 for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
@@ -549,8 +549,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             sched_ctx_.wait_init_complete();
 
 #if PTO2_PROFILING
-            if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
-                l2_perf_aicpu_set_orch_thread_idx(thread_idx);
+            if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) {
+                l2_swimlane_aicpu_set_orch_thread_idx(thread_idx);
             }
             // scope_stats streams scope_end records off the orchestrator thread:
             // record the per-thread ready_queue index. No-op (writer shared
@@ -648,7 +648,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
             // below carries the same envelope info for debugging, and
             // host-side swimlane derives per-phase timing from the per-event
-            // AicpuPhaseRecord[] stream that already covers everything inside
+            // L2SwimlaneAicpuPhaseRecord[] stream that already covers everything inside
             // submit_task().
             int32_t total_tasks = 0;
             if (rt->orchestrator.sm_header) {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index 79ae71b24..d8ec5f736 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -166,8 +166,8 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
 ```
 
 Per-thread fanout / fanin edge counts and ready-queue pop hit / miss
-stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records.json`
-captured at l2_perf_level >= 3) and `deps.json`; consume them via
+stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json`
+captured at `l2_swimlane_level >= 3`) and `deps.json`; consume them via
 `simpler_setup/tools/sched_overhead_analysis.py`.
 
 ---
@@ -241,10 +241,10 @@ mirrors the PMU pattern — two independent channels (one binary, one int):
   (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read
   by AICore (which only needs on/off to decide whether to write timing) and
   by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`.
-- **Granular level (0–4)** — `L2PerfDataHeader::l2_perf_level`
-  (shared memory). Host writes it in `L2PerfCollector::initialize`; AICPU
-  promotes it from the header in `l2_perf_aicpu_init` and exposes it via
-  `get_l2_perf_level()` (typed `L2PerfLevel`) for
+- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level`
+  (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU
+  promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via
+  `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for
   `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates.
 
 On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled`
@@ -263,7 +263,7 @@ Bare `--enable-l2-swimlane` = level 4 (backward compatible).
 
 ### Level gating in AICPU code
 
-Use the strongly-typed `L2PerfLevel` enum so each gate names the
+Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the
 content it depends on instead of relying on magic numbers:
 
 ```cpp
@@ -272,19 +272,19 @@ content it depends on instead of relying on magic numbers:
 if (is_l2_swimlane_enabled()) { ... }
 
 // AICPU dispatch/finish timestamps + fanout.
-// Granular checks below require l2_perf_aicpu_init to have already run
+// Granular checks below require l2_swimlane_aicpu_init to have already run
 // (so the level has been promoted from the shared-memory header).
-if (get_l2_perf_level() >= L2PerfLevel::AICPU_TIMING) { ... }
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... }
 
 // Scheduler main-loop phase records (SCHED_*)
-if (get_l2_perf_level() >= L2PerfLevel::SCHED_PHASES) { ... }
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... }
 
 // Orchestrator phase records
-if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { ... }
+if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... }
 ```
 
-`L2PerfLevel` is defined in `common/l2_perf_profiling.h` with
-underlying type `uint32_t` (matches the `L2PerfDataHeader::l2_perf_level`
+`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with
+underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level`
 shared-memory field and mirrors `PmuEventType : uint32_t`):
 
 | Enumerator | Underlying value |
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index db06248e6..b2efb224e 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -52,7 +52,7 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl
 // =============================================================================
 #if PTO2_ORCH_PROFILING
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 // Weak fallback for builds that don't link device_time.cpp (e.g. host).
 // The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
 //
@@ -65,11 +65,11 @@ extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabl
 // so the AICPU .so's PLT resolves to its own strong definition from
 // device_time.cpp.
 __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-// Weak fallback for builds that don't link l2_perf_collector_aicpu.cpp.
+// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
 // The strong symbol from the AICPU build wins when profiling is available.
 // Also hidden to prevent HOST .so from polluting the global symbol table.
 __attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
+l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
 // Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
 static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
 static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
@@ -87,11 +87,11 @@ uint64_t g_orch_args_atomic_count = 0;
 uint64_t g_orch_scope_end_atomic_count = 0;
 // Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what
 // the flag is for). Swim-lane recording is an opt-in add-on gated at runtime
-// by l2_perf_level so callers can collect totals without paying GM-store cost.
+// by l2_swimlane_level so callers can collect totals without paying GM-store cost.
 // When the swim-lane write fires, _t0 is re-sampled from the counter *after*
 // the write so its cost is not attributed to the next phase's accumulator.
-#define CYCLE_COUNT_START()                                                \
-    bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
     uint64_t _t0 = get_sys_cnt_aicpu(), _t1
 #define CYCLE_COUNT_LAP(acc)       \
     do {                           \
@@ -99,38 +99,38 @@ uint64_t g_orch_scope_end_atomic_count = 0;
         acc += (_t1 - _t0);        \
         _t0 = _t1;                 \
     } while (0)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid)                                           \
-    do {                                                                                     \
-        _t1 = get_sys_cnt_aicpu();                                                           \
-        acc += (_t1 - _t0);                                                                  \
-        if (_prof_active) {                                                                  \
-            l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
-            _t0 = get_sys_cnt_aicpu();                                                       \
-        } else {                                                                             \
-            _t0 = _t1;                                                                       \
-        }                                                                                    \
+#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid)                                               \
+    do {                                                                                         \
+        _t1 = get_sys_cnt_aicpu();                                                               \
+        acc += (_t1 - _t0);                                                                      \
+        if (_prof_active) {                                                                      \
+            l2_swimlane_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
+            _t0 = get_sys_cnt_aicpu();                                                           \
+        } else {                                                                                 \
+            _t0 = _t1;                                                                           \
+        }                                                                                        \
     } while (0)
 #elif PTO2_PROFILING
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
 __attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
+l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
 // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
 static uint32_t g_orch_submit_idx = 0;
-#define CYCLE_COUNT_START()                                                \
-    bool _prof_active = (orch->l2_perf_level >= L2PerfLevel::ORCH_PHASES); \
+#define CYCLE_COUNT_START()                                                        \
+    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
     uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0
 #define CYCLE_COUNT_LAP(acc) \
     do {                     \
     } while (0)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid)                                           \
-    do {                                                                                     \
-        if (_prof_active) {                                                                  \
-            _t1 = get_sys_cnt_aicpu();                                                       \
-            l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
-            _t0 = _t1;                                                                       \
-        }                                                                                    \
+#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid)                                               \
+    do {                                                                                         \
+        if (_prof_active) {                                                                      \
+            _t1 = get_sys_cnt_aicpu();                                                           \
+            l2_swimlane_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
+            _t0 = _t1;                                                                           \
+        }                                                                                        \
     } while (0)
 #else
 #define CYCLE_COUNT_START()
@@ -469,7 +469,7 @@ void PTO2OrchestratorState::end_scope() {
 #if PTO2_ORCH_PROFILING
     uint64_t _se1 = get_sys_cnt_aicpu();
     g_orch_scope_end_cycle += (_se1 - _se0);
-    // l2_perf_aicpu_record_orch_phase(AicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx, -1);
+    // l2_swimlane_aicpu_record_orch_phase(L2SwimlaneAicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx, -1);
 #endif
 }
 
@@ -504,7 +504,7 @@ static TaskOutputTensors submit_task_common(
 
     PTO2FaninBuilder fanin_builder(orch->rings[ring_id].fanin_pool);
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, L2SwimlaneAicpuPhaseId::ORCH_ALLOC, task_id.raw);
 
 #if PTO2_PROFILING
     if (layout.total_output_size > 0) {
@@ -519,7 +519,7 @@ static TaskOutputTensors submit_task_common(
 
     orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, L2SwimlaneAicpuPhaseId::ORCH_SYNC, task_id.raw);
 
     for (uint32_t i = 0; i < args.explicit_dep_count(); i++) {
         PTO2TaskId dep_task_id = args.explicit_dep(i);
@@ -557,12 +557,12 @@ static TaskOutputTensors submit_task_common(
         return result;
     }
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, L2SwimlaneAicpuPhaseId::ORCH_LOOKUP, task_id.raw);
 
     // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) ===
     register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, L2SwimlaneAicpuPhaseId::ORCH_INSERT, task_id.raw);
 
     // === STEP 5: Batch-write to GM (single cache line burst) ===
     // Deferred from allocation phase to avoid scattered GM writes that get
@@ -603,7 +603,7 @@ static TaskOutputTensors submit_task_common(
     }
 #endif
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, L2SwimlaneAicpuPhaseId::ORCH_PARAMS, task_id.raw);
 #if PTO2_ORCH_PROFILING
     g_orch_args_atomic_count += 2;  // fanout_lock.store + fanout_count.store
 #endif
@@ -617,7 +617,7 @@ static TaskOutputTensors submit_task_common(
         SPIN_WAIT_HINT();
     }
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, L2SwimlaneAicpuPhaseId::ORCH_FANIN, task_id.raw);
 
 #if PTO2_PROFILING
     orch->tasks_submitted++;
@@ -766,7 +766,7 @@ TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) {
     PTO2TaskDescriptor &task = *prepared.task;
     PTO2TaskPayload &payload = *prepared.payload;
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, prepared.task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, L2SwimlaneAicpuPhaseId::ORCH_ALLOC, prepared.task_id.raw);
 
 #if PTO2_PROFILING
     if (layout.total_output_size > 0) {
@@ -788,7 +788,7 @@ TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) {
     payload.fanin_actual_count = 0;
     payload.fanin_spill_start = 0;
     payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool;
-    CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, prepared.task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, L2SwimlaneAicpuPhaseId::ORCH_PARAMS, prepared.task_id.raw);
 
     if (prepared.slot_state != nullptr) {
         // Hidden alloc tasks complete inline in the orchestrator before any
@@ -803,7 +803,7 @@ TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) {
     }
     orch->inline_completed_tasks++;
 
-    CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, prepared.task_id.raw);
+    CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, L2SwimlaneAicpuPhaseId::ORCH_FANIN, prepared.task_id.raw);
 
 #if PTO2_PROFILING
     orch->tasks_submitted++;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 9a73714c0..bfd4e7b30 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "device_arena.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "pto_ring_buffer.h"
 #include "pto_runtime2_types.h"
 #include "pto_submit_types.h"
@@ -92,8 +92,8 @@ struct PTO2OrchestratorState {
     int32_t total_cluster_count{0};  // AIC cores = MIX clusters
     int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
 #if PTO2_PROFILING
-    // L2 perf_level copied from get_l2_perf_level().
-    L2PerfLevel l2_perf_level{L2PerfLevel::DISABLED};
+    // L2 swimlane level, copied from get_l2_swimlane_level().
+    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
 #endif
 
     // === GM HEAP (for output buffers) ===
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 6d0849e46..4db9245e5 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -15,11 +15,11 @@
 
 #include "common/unified_log.h"
 #include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "common/memory_barrier.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/platform_config.h"
 #include "pto_runtime2.h"
 #include "pto_shared_memory.h"
@@ -377,30 +377,33 @@ int32_t SchedulerContext::handle_timeout_exit(
 }
 
 #if PTO2_PROFILING
-void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed) {
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
     uint64_t sched_end_ts = get_sys_cnt_aicpu();
     LOG_INFO_V9(
         "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(l2_perf.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
-        cycles_to_us(sched_end_ts - l2_perf.sched_start_ts)
+        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
+        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
     );
 
-    uint64_t sched_total = l2_perf.sched_wiring_cycle + l2_perf.sched_complete_cycle + l2_perf.sched_scan_cycle +
-                           l2_perf.sched_dispatch_cycle + l2_perf.sched_idle_cycle;
+    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
+                           l2_swimlane.sched_scan_cycle + l2_swimlane.sched_dispatch_cycle +
+                           l2_swimlane.sched_idle_cycle;
     if (sched_total == 0) sched_total = 1;
 
 #if PTO2_SCHED_PROFILING
     {
         PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
         uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
-        uint64_t complete_poll = (l2_perf.sched_complete_cycle > otc_total + l2_perf.sched_complete_perf_cycle) ?
-                                     (l2_perf.sched_complete_cycle - otc_total - l2_perf.sched_complete_perf_cycle) :
-                                     0;
-        uint64_t dispatch_poll =
-            (l2_perf.sched_dispatch_cycle > l2_perf.sched_dispatch_pop_cycle + l2_perf.sched_dispatch_setup_cycle) ?
-                (l2_perf.sched_dispatch_cycle - l2_perf.sched_dispatch_pop_cycle - l2_perf.sched_dispatch_setup_cycle) :
+        uint64_t complete_poll =
+            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
+                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
                 0;
+        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
+                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
+                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
+                                      l2_swimlane.sched_dispatch_setup_cycle) :
+                                     0;
 
         LOG_INFO_V9(
             "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
@@ -411,20 +414,21 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
         // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
         // × core_to_thread).
         LOG_INFO_V9(
-            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_complete_cycle),
-            l2_perf.sched_complete_cycle * 100.0 / sched_total
+            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
+            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
         );
 
-        uint64_t c_parent = l2_perf.sched_complete_cycle > 0 ? l2_perf.sched_complete_cycle : 1;
-        uint64_t complete_miss_count = (l2_perf.complete_probe_count > l2_perf.complete_hit_count) ?
-                                           (l2_perf.complete_probe_count - l2_perf.complete_hit_count) :
+        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
+        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
+                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
                                            0;
-        double complete_hit_rate =
-            l2_perf.complete_probe_count > 0 ? l2_perf.complete_hit_count * 100.0 / l2_perf.complete_probe_count : 0.0;
+        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
+                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
+                                       0.0;
         LOG_INFO_V9(
             "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
             thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
-            static_cast<uint64_t>(l2_perf.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
+            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
             complete_hit_rate
         );
         LOG_INFO_V9(
@@ -451,7 +455,8 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
         );
         LOG_INFO_V9(
             "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent
+            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
+            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
         );
 
         // pop_hit / pop_miss per-emit deltas live in each dispatch-phase
@@ -459,70 +464,72 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
         // the run-cumulative tracked in this struct (final-drain emit covers
         // the trailing-idle tail).
         LOG_INFO_V9(
-            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle),
-            l2_perf.sched_dispatch_cycle * 100.0 / sched_total
+            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
+            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
         );
-        uint64_t global_dispatch_count = l2_perf.pop_hit - l2_perf.local_dispatch_count;
-        uint64_t total_dispatched = l2_perf.local_dispatch_count + global_dispatch_count;
-        double local_hit_rate = total_dispatched > 0 ? l2_perf.local_dispatch_count * 100.0 / total_dispatched : 0.0;
+        uint64_t global_dispatch_count = l2_swimlane.pop_hit - l2_swimlane.local_dispatch_count;
+        uint64_t total_dispatched = l2_swimlane.local_dispatch_count + global_dispatch_count;
+        double local_hit_rate =
+            total_dispatched > 0 ? l2_swimlane.local_dispatch_count * 100.0 / total_dispatched : 0.0;
         LOG_INFO_V9(
             "Thread %d:     local_disp   : local=%" PRIu64 ", global=%" PRIu64 ", overflow=%" PRIu64
             ", local_rate=%.1f%%",
-            thread_idx, static_cast<uint64_t>(l2_perf.local_dispatch_count),
-            static_cast<uint64_t>(global_dispatch_count), static_cast<uint64_t>(l2_perf.local_overflow_count),
+            thread_idx, static_cast<uint64_t>(l2_swimlane.local_dispatch_count),
+            static_cast<uint64_t>(global_dispatch_count), static_cast<uint64_t>(l2_swimlane.local_overflow_count),
             local_hit_rate
         );
 
-        uint64_t d_parent = l2_perf.sched_dispatch_cycle > 0 ? l2_perf.sched_dispatch_cycle : 1;
+        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
         LOG_INFO_V9(
             "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
             dispatch_poll * 100.0 / d_parent
         );
         LOG_INFO_V9(
             "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(l2_perf.sched_dispatch_pop_cycle), l2_perf.sched_dispatch_pop_cycle * 100.0 / d_parent,
-            cycles_to_us(l2_perf.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
+            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
             static_cast<uint64_t>(sp.pop_atomic_count)
         );
         LOG_INFO_V9(
             "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_perf.sched_dispatch_setup_cycle), l2_perf.sched_dispatch_setup_cycle * 100.0 / d_parent
+            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
+            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
         );
 
         LOG_INFO_V9(
-            "Thread %d:   scan           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_scan_cycle),
-            l2_perf.sched_scan_cycle * 100.0 / sched_total
+            "Thread %d:   scan           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_scan_cycle),
+            l2_swimlane.sched_scan_cycle * 100.0 / sched_total
         );
 
 #if PTO2_SCHED_PROFILING
         LOG_INFO_V9(
             "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
-            cycles_to_us(l2_perf.sched_wiring_cycle), l2_perf.sched_wiring_cycle * 100.0 / sched_total,
-            l2_perf.phase_wiring_count
+            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
+            l2_swimlane.phase_wiring_count
         );
 #else
         LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_wiring_cycle),
-            l2_perf.sched_wiring_cycle * 100.0 / sched_total
+            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
+            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
         );
 #endif
 
         LOG_INFO_V9(
-            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_idle_cycle),
-            l2_perf.sched_idle_cycle * 100.0 / sched_total
+            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
+            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
         );
 
         if (cur_thread_completed > 0) {
             LOG_INFO_V9(
                 "Thread %d:   avg/complete   : %.3fus", thread_idx,
-                cycles_to_us(l2_perf.sched_complete_cycle) / cur_thread_completed
+                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
             );
         }
     }
 #endif
     LOG_INFO_V9(
         "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
-        cycles_to_us(sched_total), static_cast<uint64_t>(l2_perf.sched_loop_count), cur_thread_completed
+        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
     );
 }
 #endif
@@ -837,18 +844,18 @@ int32_t SchedulerContext::init(
     regs_ = regs_base;
 
 #if PTO2_PROFILING
-    // l2_perf_aicpu_init promotes g_l2_perf_level from the shared-memory
+    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
     // header — must be called BEFORE the orchestrator thread caches the level
-    // via rt->orchestrator.l2_perf_level = get_l2_perf_level() in
+    // via rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level() in
     // AicpuExecutor::run(). Otherwise the cached value would still be DISABLED
     // (only the binary enable bit has been seeded by kernel.cpp at this point),
     // and the CYCLE_COUNT_START() gate in pto_orchestrator.cpp would suppress
     // all ORCH_PHASES records.
     if (is_l2_swimlane_enabled()) {
-        l2_perf_aicpu_init(runtime->worker_count);
-        l2_perf_level_ = get_l2_perf_level();
-        if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-            l2_perf_aicpu_init_phase(runtime->worker_count, sched_thread_num_);
+        l2_swimlane_aicpu_init(runtime->worker_count);
+        l2_swimlane_level_ = get_l2_swimlane_level();
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_thread_num_);
         }
     }
 #endif
@@ -973,9 +980,9 @@ void SchedulerContext::on_orchestration_done(
     Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
 ) {
 #if PTO2_PROFILING
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
         // Flush orchestrator's phase record buffer
-        l2_perf_aicpu_flush_phase_buffers(thread_idx);
+        l2_swimlane_aicpu_flush_phase_buffers(thread_idx);
     }
 #endif
 
@@ -1028,10 +1035,10 @@ void SchedulerContext::on_orchestration_done(
     // Write core-to-thread mapping AFTER reassignment so the profiling data
     // reflects the final distribution (all active_sched_threads_, including
     // former orchestrator threads when orch_to_sched_ is enabled).
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-        l2_perf_aicpu_init_core_assignments(cores_total_num_);
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
         for (int32_t t = 0; t < active_sched_threads_; t++) {
-            l2_perf_aicpu_write_core_assignments_for_thread(
+            l2_swimlane_aicpu_write_core_assignments_for_thread(
                 t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
             );
         }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
index 687d5f15d..5784f54dc 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
@@ -13,7 +13,7 @@
 #include "common/unified_log.h"
 #include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "pto_runtime2.h"
@@ -21,7 +21,7 @@
 #include "spin_hint.h"
 
 // Performance profiling headers
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 
@@ -77,7 +77,7 @@ void SchedulerContext::complete_slot_task(
 #endif
 ) {
 #if PTO2_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #else
     (void)hank;
 #endif
@@ -130,7 +130,7 @@ void SchedulerContext::complete_slot_task(
         sched_->on_mixed_task_complete(slot_state, local_bufs);
 #endif
 #if PTO2_PROFILING
-        l2_perf.phase_complete_count++;
+        l2_swimlane.phase_complete_count++;
 #endif
         if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
             deferred_release_slot_states[deferred_release_count++] = &slot_state;
@@ -151,7 +151,7 @@ void SchedulerContext::complete_slot_task(
     }
 
 #if PTO2_PROFILING
-    if (l2_perf.l2_perf_enabled) {
+    if (l2_swimlane.l2_swimlane_enabled) {
 #if PTO2_SCHED_PROFILING
         uint64_t t_perf_start = get_sys_cnt_aicpu();
 #endif
@@ -159,7 +159,7 @@ void SchedulerContext::complete_slot_task(
         uint64_t fanout_arr[RUNTIME_MAX_FANOUT];
         int32_t fanout_n = 0;
 
-        if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) {
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
             finish_ts = get_sys_cnt_aicpu();
             PTO2DepListEntry *cur = slot_state.fanout_head;
             while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) {
@@ -169,18 +169,18 @@ void SchedulerContext::complete_slot_task(
         }
 
         int32_t perf_slot_idx = static_cast<int32_t>(subslot);
-        if (l2_perf_aicpu_complete_record(
+        if (l2_swimlane_aicpu_complete_task(
                 core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), slot_state.task->task_id.raw,
                 slot_state.task->kernel_id[perf_slot_idx], hank[core_id].core_type, dispatch_ts, finish_ts, fanout_arr,
                 fanout_n
             ) != 0) {
             LOG_ERROR(
-                "Core %d: l2_perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id,
+                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
                 static_cast<uint64_t>(slot_state.task->task_id.raw)
             );
         }
 #if PTO2_SCHED_PROFILING
-        l2_perf.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
+        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
 #endif
     }
 #endif
@@ -224,7 +224,7 @@ void SchedulerContext::check_running_cores_for_completion(
     PTO2LocalReadyBuffer *local_bufs
 ) {
 #if PTO2_SCHED_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #endif
     CoreTracker &tracker = core_trackers_[thread_idx];
     auto running_core_states = tracker.get_all_running_cores();
@@ -246,8 +246,8 @@ void SchedulerContext::check_running_cores_for_completion(
         int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
 
 #if PTO2_SCHED_PROFILING
-        if (l2_perf.l2_perf_enabled) {
-            l2_perf.complete_probe_count++;
+        if (l2_swimlane.l2_swimlane_enabled) {
+            l2_swimlane.complete_probe_count++;
         }
 #endif
 
@@ -256,8 +256,8 @@ void SchedulerContext::check_running_cores_for_completion(
         if (!t.matched) continue;
 
 #if PTO2_SCHED_PROFILING
-        if (l2_perf.l2_perf_enabled && (t.running_done || t.pending_done)) {
-            l2_perf.complete_hit_count++;
+        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
+            l2_swimlane.complete_hit_count++;
         }
 #endif
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 34271baef..0f6edf1aa 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -11,7 +11,7 @@
 #ifndef SCHEDULER_CONTEXT_H
 #define SCHEDULER_CONTEXT_H
 
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/unified_log.h"
 #include "scheduler_types.h"
 
@@ -134,10 +134,10 @@ class SchedulerContext {
     SyncStartDrainState drain_state_;
 
 #if PTO2_PROFILING
-    SchedL2PerfCounters sched_l2_perf_[MAX_AICPU_THREADS];
-    // Cached once at init() from get_l2_perf_level(), AFTER
-    // l2_perf_aicpu_init has promoted the level from the shared-memory header.
-    L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};
+    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
+    // Cached once at init() from get_l2_swimlane_level(), AFTER
+    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
 #endif
 
     // --- Task-execution tracking ---
@@ -353,7 +353,7 @@ class SchedulerContext {
     );
 
 #if PTO2_PROFILING
-    __attribute__((noinline, cold)) void log_l2_perf_summary(int32_t thread_idx, int32_t cur_thread_completed);
+    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
 #endif
 
     // =========================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 9ca15c9c1..0a3efe40f 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -18,7 +18,7 @@
 #include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
 #include "callable.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "pto_runtime2.h"
@@ -26,7 +26,7 @@
 #include "spin_hint.h"
 
 // Performance profiling headers
-#include "aicpu/l2_perf_collector_aicpu.h"
+#include "aicpu/l2_swimlane_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 
@@ -74,15 +74,15 @@ int SchedulerContext::pop_ready_tasks_batch(
     PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
 ) {
 #if PTO2_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #if PTO2_SCHED_PROFILING
     extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
     uint64_t t_pop_start = get_sys_cnt_aicpu();
     int count = sched_->get_ready_tasks_batch(
         shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx],
-        l2_perf.local_dispatch_count
+        l2_swimlane.local_dispatch_count
     );
-    l2_perf.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
+    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
 #else
     int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
 #endif
@@ -90,9 +90,9 @@ int SchedulerContext::pop_ready_tasks_batch(
     // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health
     // stats on default builds.
     if (count > 0) {
-        l2_perf.pop_hit += count;
+        l2_swimlane.pop_hit += count;
     } else {
-        l2_perf.pop_miss++;
+        l2_swimlane.pop_miss++;
     }
 #else
     (void)thread_idx;
@@ -155,7 +155,7 @@ void SchedulerContext::dispatch_subtask_to_core(
         core_exec_state.pending_slot_state = &slot_state;
         core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
 #if PTO2_PROFILING
-        if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) {
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
             core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu();
         }
 #endif
@@ -164,7 +164,7 @@ void SchedulerContext::dispatch_subtask_to_core(
         core_exec_state.running_slot_state = &slot_state;
         core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
 #if PTO2_PROFILING
-        if (l2_perf_level_ >= L2PerfLevel::AICPU_TIMING) {
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
             core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu();
         }
 #endif
@@ -247,7 +247,7 @@ void SchedulerContext::dispatch_block(
         );
     }
 #if PTO2_PROFILING
-    sched_l2_perf_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask());
+    sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask());
 #endif
 }
 
@@ -256,7 +256,7 @@ void SchedulerContext::dispatch_shape(
     PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
 ) {
 #if PTO2_SCHED_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #endif
     if (entered_drain) return;
 
@@ -324,7 +324,7 @@ void SchedulerContext::dispatch_shape(
             }
             made_progress = true;
 #if PTO2_SCHED_PROFILING
-            l2_perf.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
+            l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
 #endif
         }
 
@@ -353,7 +353,7 @@ void SchedulerContext::dispatch_ready_tasks(
     const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
 
 #if PTO2_SCHED_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
 #endif
 
     // Note: flush_local_bufs is invoked multiple times per pass (mid-function
@@ -367,7 +367,7 @@ void SchedulerContext::dispatch_ready_tasks(
         for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
             auto &lb = local_bufs[s];
 #if PTO2_SCHED_PROFILING
-            l2_perf.local_overflow_count += lb.count;
+            l2_swimlane.local_overflow_count += lb.count;
 #endif
             if (lb.count > 0) {
                 sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
@@ -481,9 +481,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     );
 
     // One-time init: assign perf buffers (one thread does it; others wait).
-    // l2_perf_aicpu_init / l2_perf_aicpu_init_phase already ran eagerly in
+    // l2_swimlane_aicpu_init / l2_swimlane_aicpu_init_phase already ran eagerly in
     // SchedulerContext::init() so the orchestrator thread can read the
-    // promoted g_l2_perf_level before caching it on rt->orchestrator. Only
+    // promoted g_l2_swimlane_level before caching it on rt->orchestrator. Only
     // dump_tensor / pmu init remain dispatch-time because they depend on
     // handshake-derived core IDs / counts.
     if (!init_done_.exchange(true, std::memory_order_acq_rel)) {
@@ -512,9 +512,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     int32_t idle_iterations = 0;
     int32_t last_progress_count = 0;
 #if PTO2_PROFILING
-    auto &l2_perf = sched_l2_perf_[thread_idx];
-    l2_perf.reset();
-    l2_perf.l2_perf_enabled = (l2_perf_level_ != L2PerfLevel::DISABLED);
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+    l2_swimlane.reset();
+    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
 #endif
 
     constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
@@ -529,7 +529,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     bool cores_released = false;
 
 #if PTO2_PROFILING
-    l2_perf.sched_start_ts = get_sys_cnt_aicpu();
+    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
 #endif
 
     while (true) {
@@ -539,7 +539,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
         bool made_progress = false;
 #if PTO2_PROFILING
         CYCLE_COUNT_START();
-        l2_perf.sched_loop_count++;
+        l2_swimlane.sched_loop_count++;
         uint64_t _t0_phase = _t0;
 #endif
         int32_t task_count = 0;
@@ -554,7 +554,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
         }
 
 #if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
+        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
 #endif
 
         // Phase 1: Check running cores for completion
@@ -616,16 +616,16 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
 #if PTO2_PROFILING
         if (!try_completed) {
-            CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
         } else {
-            CYCLE_COUNT_LAP(l2_perf.sched_complete_cycle);
-            if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_complete_count > 0) {
-                l2_perf_aicpu_record_phase(
-                    thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_perf.sched_loop_count,
-                    l2_perf.phase_complete_count
+            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_complete_count > 0) {
+                l2_swimlane_aicpu_record_phase(
+                    thread_idx, L2SwimlaneAicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_complete_count
                 );
                 _t0_phase = _t1;
-                l2_perf.phase_complete_count = 0;
+                l2_swimlane.phase_complete_count = 0;
             }
         }
 #endif
@@ -644,12 +644,12 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             if (wired > 0) {
                 made_progress = true;
 #if PTO2_SCHED_PROFILING
-                l2_perf.phase_wiring_count += wired;
+                l2_swimlane.phase_wiring_count += wired;
 #endif
             }
         }
 #if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_perf.sched_wiring_cycle);
+        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
 #endif
 
         // Phase 3b: Drain dummy ready queue (thread 0 only).
@@ -700,28 +700,28 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
 #if PTO2_PROFILING
         if (!try_pushed) {
-            CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
         } else {
-            CYCLE_COUNT_LAP(l2_perf.sched_dispatch_cycle);
-            if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES && l2_perf.phase_dispatch_count > 0) {
+            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) {
                 // Per-emit pop deltas via snapshot diff; the cumulative
                 // pop_hit / pop_miss stay intact for the cold-path log.
-                uint64_t pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit;
-                uint64_t pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit;
-                // AicpuPhaseRecord's extras are uint32 — a delta that overflows means
+                uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+                uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
+                // L2SwimlaneAicpuPhaseRecord's extras are uint32 — a delta that overflows means
                 // an emit was missed for ~4 billion pops, which is well outside any
                 // realistic dispatch cadence and silently truncates without this guard.
                 debug_assert(pop_hit_delta < (1ULL << 32));
                 debug_assert(pop_miss_delta < (1ULL << 32));
-                l2_perf_aicpu_record_phase(
-                    thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_perf.sched_loop_count,
-                    l2_perf.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
+                l2_swimlane_aicpu_record_phase(
+                    thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, l2_swimlane.sched_loop_count,
+                    l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
                     static_cast<uint32_t>(pop_miss_delta)
                 );
                 _t0_phase = _t1;
-                l2_perf.phase_dispatch_count = 0;
-                l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit;
-                l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss;
+                l2_swimlane.phase_dispatch_count = 0;
+                l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+                l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
             }
         }
 #endif
@@ -756,17 +756,17 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
                     thread_idx, header, runtime, idle_iterations, last_progress_count
 #if PTO2_PROFILING
                     ,
-                    l2_perf.sched_start_ts
+                    l2_swimlane.sched_start_ts
 #endif
                 );
             } else {
                 SPIN_WAIT_HINT();
             }
 #if PTO2_PROFILING
-            CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
-            if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-                l2_perf_aicpu_record_phase(
-                    thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, l2_perf.sched_loop_count, 0
+            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
+            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+                l2_swimlane_aicpu_record_phase(
+                    thread_idx, L2SwimlaneAicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, l2_swimlane.sched_loop_count, 0
                 );
                 _t0_phase = _t1;
             }
@@ -794,31 +794,31 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     // sum(record.pop_*) reconciles with the run-cumulative counter.
     // Gate on SCHED_PHASES — at lower levels the phase buffer is never
     // flushed (see below), so writing this record would be wasted work.
-    if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-        uint64_t final_pop_hit_delta = l2_perf.pop_hit - l2_perf.pop_hit_at_last_emit;
-        uint64_t final_pop_miss_delta = l2_perf.pop_miss - l2_perf.pop_miss_at_last_emit;
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
+        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
         debug_assert(final_pop_hit_delta < (1ULL << 32));
         debug_assert(final_pop_miss_delta < (1ULL << 32));
         if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
             uint64_t t_now = get_sys_cnt_aicpu();
-            l2_perf_aicpu_record_phase(
-                thread_idx, AicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_perf.sched_loop_count, 0,
+            l2_swimlane_aicpu_record_phase(
+                thread_idx, L2SwimlaneAicpuPhaseId::SCHED_DISPATCH, t_now, t_now, l2_swimlane.sched_loop_count, 0,
                 static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta)
             );
-            l2_perf.pop_hit_at_last_emit = l2_perf.pop_hit;
-            l2_perf.pop_miss_at_last_emit = l2_perf.pop_miss;
+            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
+            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
         }
     }
-    log_l2_perf_summary(thread_idx, cur_thread_completed);
+    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
 #endif
 
 #if PTO2_PROFILING
-    if (l2_perf.l2_perf_enabled) {
-        l2_perf_aicpu_flush_buffers(
+    if (l2_swimlane.l2_swimlane_enabled) {
+        l2_swimlane_aicpu_flush(
             thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
         );
-        if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
-            l2_perf_aicpu_flush_phase_buffers(thread_idx);
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+            l2_swimlane_aicpu_flush_phase_buffers(thread_idx);
         }
     }
 #endif
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
index fd155307a..27eee8e3b 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -342,8 +342,8 @@ struct SlotTransition {
 // =============================================================================
 
 #if PTO2_PROFILING
-struct alignas(64) SchedL2PerfCounters {
-    bool l2_perf_enabled{false};
+struct alignas(64) SchedL2SwimlaneCounters {
+    bool l2_swimlane_enabled{false};
     uint64_t sched_start_ts{0};
     uint64_t sched_scan_cycle{0};
     uint64_t sched_complete_cycle{0};
@@ -371,7 +371,7 @@ struct alignas(64) SchedL2PerfCounters {
     uint64_t sched_dispatch_pop_cycle{0};
     uint64_t sched_dispatch_setup_cycle{0};
 #endif
-    void reset() { *this = SchedL2PerfCounters{}; }
+    void reset() { *this = SchedL2SwimlaneCounters{}; }
 };
 #endif
 
diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp
index 6bab29993..ab9d246e5 100644
--- a/src/common/platform/onboard/host/device_runner_base.cpp
+++ b/src/common/platform/onboard/host/device_runner_base.cpp
@@ -939,7 +939,7 @@ void DeviceRunnerBase::start_shared_collectors_for_run() {
         return create_thread(std::move(fn));
     };
     if (enable_l2_swimlane_) {
-        l2_perf_collector_.start(thread_factory);
+        l2_swimlane_collector_.start(thread_factory);
     }
     if (enable_dump_tensor_) {
         dump_collector_.start(thread_factory);
@@ -958,10 +958,10 @@ void DeviceRunnerBase::teardown_shared_collectors_after_run() {
     // Diagnostic exports use the per-task `output_prefix_` directory the user
     // set on CallConfig (CallConfig::validate() enforces non-empty upstream).
     if (enable_l2_swimlane_) {
-        l2_perf_collector_.stop();
-        l2_perf_collector_.read_phase_header_metadata();
-        l2_perf_collector_.reconcile_counters();
-        l2_perf_collector_.export_swimlane_json();
+        l2_swimlane_collector_.stop();
+        l2_swimlane_collector_.read_phase_header_metadata();
+        l2_swimlane_collector_.reconcile_counters();
+        l2_swimlane_collector_.export_swimlane_json();
     }
 
     if (enable_dump_tensor_) {
diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h
index 72daed69d..73e42e5b2 100644
--- a/src/common/platform/onboard/host/device_runner_base.h
+++ b/src/common/platform/onboard/host/device_runner_base.h
@@ -50,11 +50,11 @@
 
 #include "arg_direction.h"
 #include "callable.h"
-#include "common/l2_perf_profiling.h"
+#include "common/l2_swimlane_profiling.h"
 #include "device_arena.h"
 #include "device_runner_helpers.h"
 #include "host/load_aicpu_op.h"
-#include "host/l2_perf_collector.h"
+#include "host/l2_swimlane_collector.h"
 #include "host/memory_allocator.h"
 #include "host/pmu_collector.h"
 #include "host/scope_stats_collector.h"
@@ -378,8 +378,8 @@ class DeviceRunnerBase {
      * `set_dep_gen_enabled` is a2a3-only and lives on the subclass.
      */
     void set_l2_swimlane_enabled(int level) {
-        l2_perf_level_ = static_cast<L2PerfLevel>(level);
-        enable_l2_swimlane_ = (l2_perf_level_ != L2PerfLevel::DISABLED);
+        l2_swimlane_level_ = static_cast<L2SwimlaneLevel>(level);
+        enable_l2_swimlane_ = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
     }
     void set_dump_tensor_enabled(bool enable) { enable_dump_tensor_ = enable; }
     void set_pmu_enabled(int enable_pmu) {
@@ -390,7 +390,7 @@ class DeviceRunnerBase {
 
     /**
      * Directory under which all diagnostic artifacts
-     * (l2_perf_records.json / tensor_dump/ / pmu.csv) land. Required
+     * (l2_swimlane_records.json / tensor_dump/ / pmu.csv) land. Required
      * (non-empty) when any diagnostic is enabled; `CallConfig::validate()`
      * enforces this contract upstream.
      */
@@ -541,7 +541,7 @@ class DeviceRunnerBase {
 
     /**
      * Start collector mgmt + poll threads for the four shared
-     * diagnostics collectors (`l2_perf_collector_`, `dump_collector_`,
+     * diagnostics collectors (`l2_swimlane_collector_`, `dump_collector_`,
      * `pmu_collector_`, `scope_stats_collector_`) that are enabled.
      * Each `start()` is gated on the corresponding `enable_*_` flag;
      * disabled collectors are not started.
@@ -557,7 +557,7 @@ class DeviceRunnerBase {
      * Tear down the four shared diagnostics collectors after the launched
      * kernels have synced. Each block is gated on the corresponding
      * `enable_*_` flag and does: stop() → reconcile_counters() →
-     * export step (`l2_perf` writes swimlane JSON via
+     * export step (`l2_swimlane` writes swimlane JSON via
      * `read_phase_header_metadata` + `export_swimlane_json`; `dump`
      * writes dump files; `pmu` has no export step beyond reconcile;
      * `scope_stats` writes JSONL).
@@ -748,7 +748,7 @@ class DeviceRunnerBase {
     // direct `rtMalloc`/`rtFree`), but the storage and lifetime live
     // on the base. `DepGenCollector` is a2a3-only and stays on the
     // a2a3 subclass.
-    L2PerfCollector l2_perf_collector_;
+    L2SwimlaneCollector l2_swimlane_collector_;
     TensorDumpCollector dump_collector_;
     PmuCollector pmu_collector_;
     ScopeStatsCollector scope_stats_collector_;
@@ -760,9 +760,9 @@ class DeviceRunnerBase {
     bool enable_dump_tensor_{false};
     bool enable_pmu_{false};
     bool enable_scope_stats_{false};
-    L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};             // resolved from set_l2_swimlane_enabled()
-    PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};  // resolved from set_pmu_enabled()
-    std::string output_prefix_{};                                  // diagnostic artifact root directory
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};  // resolved from set_l2_swimlane_enabled()
+    PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};   // resolved from set_pmu_enabled()
+    std::string output_prefix_{};                                   // diagnostic artifact root directory
 };
 
 #endif  // SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H
diff --git a/src/common/task_interface/call_config.h b/src/common/task_interface/call_config.h
index 58ca0076b..7926356dc 100644
--- a/src/common/task_interface/call_config.h
+++ b/src/common/task_interface/call_config.h
@@ -16,7 +16,7 @@
  * `enable_dump_tensor`, `enable_pmu`, `enable_dep_gen`, and
  * `enable_scope_stats`. All five require `output_prefix` because they each
  * write a sibling artifact into that directory
- * (`l2_perf_records.json` / `tensor_dump/` / `pmu.csv` / `deps.json` /
+ * (`l2_swimlane_records.json` / `tensor_dump/` / `pmu.csv` / `deps.json` /
  * `scope_stats.json`).
  *
  * `block_dim == 0` is a sentinel for "auto" — DeviceRunner resolves it at
@@ -35,7 +35,7 @@
  * across compilers (sizeof(bool) is implementation-defined).
  *
  * `output_prefix` is a NUL-terminated directory path under which all
- * diagnostic artifacts (l2_perf_records.json / tensor_dump/ / pmu.csv /
+ * diagnostic artifacts (l2_swimlane_records.json / tensor_dump/ / pmu.csv /
  * deps.json / scope_stats.json) are written. The caller is responsible for
  * filling it whenever any diagnostic flag is enabled — `validate()` enforces
  * this contract at every submit/run entry point so the runtime never has to
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py
index 428e6efbd..4c252128e 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py
@@ -20,7 +20,7 @@
   implicitly: if it broke, deps.json would be empty or wrong.
 
 deps.json is now the sole source of truth for fanout edges — the device
-hot path no longer records L2PerfRecord::fanout[], so there is no
+hot path no longer records L2SwimlaneAicpuTaskRecord::fanout[], so there is no
 "fanout ⊆ deps" cross-check to run. swimlane_converter.py joins
 deps.json into the Perfetto trace at post-process time.
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py
index c2b3a18e1..e12c44fcd 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py
@@ -48,12 +48,14 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None =
     matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
     if not matches:
         return
-    perf = matches[-1] / "l2_perf_records.json"
-    assert perf.exists(), f"l2_perf_records.json missing under {matches[-1]} — swimlane capture failed?"
+    perf = matches[-1] / "l2_swimlane_records.json"
+    assert perf.exists(), f"l2_swimlane_records.json missing under {matches[-1]} — swimlane capture failed?"
 
     with perf.open() as f:
         data = json.load(f)
-    assert data.get("l2_perf_level") in (1, 2, 3, 4), f"unexpected l2_perf_level: {data.get('l2_perf_level')}"
+    assert data.get("l2_swimlane_level") in (1, 2, 3, 4), (
+        f"unexpected l2_swimlane_level: {data.get('l2_swimlane_level')}"
+    )
     tasks = data.get("tasks")
     assert isinstance(tasks, list), "tasks field missing or not a list"
     assert len(tasks) > 0, f"perf records empty under {perf}"
@@ -86,7 +88,7 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None =
 
     # ---- Tool smoke: sched_overhead_analysis ----
     # pop_hit / pop_miss come from the dispatch-phase extras the runtime writes
-    # (l2_perf_collector.cpp). The differential block below cross-validates
+    # (l2_swimlane_collector.cpp). The differential block below cross-validates
     # the script's printed numbers against an independent oracle computed
     # straight from the raw artifacts — any regression in either the runtime
     # capture path or the parser arithmetic fails here in the same CI step
@@ -96,7 +98,7 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None =
             sys.executable,
             "-m",
             "simpler_setup.tools.sched_overhead_analysis",
-            "--l2-perf-records-json",
+            "--l2-swimlane-records-json",
             str(perf),
         ],
         check=True,
@@ -128,7 +130,7 @@ def verify_sched_overhead_differential(stdout: str, perf: dict, artifact_dir: Pa
 
     Args:
         stdout: captured ``sched_overhead_analysis`` stdout.
-        perf: parsed ``l2_perf_records.json`` dict — passed in by the caller
+        perf: parsed ``l2_swimlane_records.json`` dict — passed in by the caller
             so we don't re-read multi-MB profiling artifacts here.
         artifact_dir: per-case output directory. ``deps.json`` is looked up
             beside the perf JSON; absent → fanout / fanin half is skipped.
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py
index ceed3eb7f..56f371b7a 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/test_l2_swimlane.py
@@ -8,7 +8,7 @@
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
 """L2 swimlane profiling smoke — capture pipeline produces a usable
-``l2_perf_records.json``.
+``l2_swimlane_records.json``.
 
 Re-uses ``vector_example`` as a known-good 5-task AIV-only workload. When the
 ``--enable-l2-swimlane`` flag is on, the helper in :mod:`_swimlane_validate`
@@ -36,7 +36,7 @@
 
 @scene_test(level=2, runtime="tensormap_and_ringbuffer")
 class TestL2Swimlane(SceneTestCase):
-    """Vector example with --enable-l2-swimlane, then assert l2_perf_records.json."""
+    """Vector example with --enable-l2-swimlane, then assert l2_swimlane_records.json."""
 
     CALLABLE = {
         "orchestration": {