From 95969ea2d1506da7c848ce2e0ab4d48b38bf2939 Mon Sep 17 00:00:00 2001 From: poursoul Date: Tue, 21 Apr 2026 09:55:47 +0800 Subject: [PATCH 1/4] Refactor: move slot_states into PTO2SharedMemoryHandle as shared resource - Allocate PTO2TaskSlotState arrays in shared memory alongside task_descriptors and task_payloads instead of scheduler-private heap - Add task_window_sizes/task_window_masks cache and get_slot_state_by_* accessors to PTO2SharedMemoryHandle for direct lookup - Orchestrator now accesses slot_states via sm_handle instead of indirecting through scheduler's RingSchedState - FaninPool/DepListPool reclaim/ensure_space take PTO2SharedMemoryHandle& instead of PTO2SchedulerState&, removing scheduler coupling - Remove pto2_append_fanin_or_fail sched parameter (the function now passes orch->sm_handle to ensure_space, and its misconfiguration check covers only the fanin spill pool pointer) - Applied to a2a3 and a5 tensormap_and_ringbuffer variants --- .../runtime/pto_orchestrator.cpp | 23 ++++++++----------- .../runtime/pto_ring_buffer.cpp | 18 ++++++++------- .../runtime/pto_ring_buffer.h | 12 ++++------ .../runtime/pto_scheduler.cpp | 16 +++---------- .../runtime/pto_scheduler.h | 2 +- .../runtime/pto_shared_memory.cpp | 8 +++++++ .../runtime/pto_shared_memory.h | 13 +++++++++++ .../runtime/pto_orchestrator.cpp | 23 ++++++++----------- .../runtime/pto_ring_buffer.cpp | 18 ++++++++------- .../runtime/pto_ring_buffer.h | 12 ++++------ .../runtime/pto_scheduler.cpp | 16 +++---------- .../runtime/pto_scheduler.h | 2 +- .../runtime/pto_shared_memory.cpp | 8 +++++++ .../runtime/pto_shared_memory.h | 13 +++++++++++ 14 files changed, 100 insertions(+), 84 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index f32c74238..b5f86899e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -198,8 +198,8 @@ struct 
PTO2FaninBuilder { static bool pto2_append_fanin_or_fail( PTO2OrchestratorState *orch, PTO2TaskId task_id, int32_t tensor_arg_index, TensorArgType ptype, - PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, PTO2SchedulerState *sched, PTO2RingFlowControl &fc, - uint8_t ring_id, const char *reason + PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, PTO2RingFlowControl &fc, uint8_t ring_id, + const char *reason ) { if (fanin_builder->contains(prod_state)) { return true; @@ -210,11 +210,11 @@ static bool pto2_append_fanin_or_fail( return true; } - if (sched == nullptr || fanin_builder->spill_pool == nullptr) { + if (fanin_builder->spill_pool == nullptr) { LOG_ERROR("========================================"); LOG_ERROR("FATAL: Fanin Spill Builder Misconfigured!"); LOG_ERROR("========================================"); - LOG_ERROR("Missing scheduler or fanin spill pool while appending dynamic fanin."); + LOG_ERROR("Missing fanin spill pool while appending dynamic fanin."); LOG_ERROR(" task_id.raw: %" PRIu64, task_id.raw); LOG_ERROR(" tensor_arg_index: %d", tensor_arg_index); LOG_ERROR(" tensor_arg_type: %d", static_cast(ptype)); @@ -225,7 +225,7 @@ static bool pto2_append_fanin_or_fail( } PTO2FaninPool &fanin_pool = *fanin_builder->spill_pool; - fanin_pool.ensure_space(*sched, fc, ring_id, 1); + fanin_pool.ensure_space(*orch->sm_handle, fc, ring_id, 1); int32_t spill_idx = fanin_pool.top; PTO2FaninSpillEntry *entry = fanin_pool.alloc(); if (entry == nullptr) { @@ -321,16 +321,14 @@ static bool pto2_prepare_task( return false; } - auto sched = orch->scheduler; out->alloc_result = allocator.alloc(total_output_size); if (out->alloc_result.failed()) { pto2_orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK); return false; } - auto &rs = sched->ring_sched_states[ring_id]; out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); - out->slot_state = &rs.get_slot_state_by_slot(out->alloc_result.slot); + out->slot_state = 
&orch->sm_handle->get_slot_state_by_slot(ring_id, out->alloc_result.slot); out->task = &orch->sm_handle->task_descriptors[ring_id][out->alloc_result.slot]; out->payload = &orch->sm_handle->task_payloads[ring_id][out->alloc_result.slot]; @@ -616,10 +614,9 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke // Step A: creator retention — all existing tensors extend their creator lifetime. PTO2TaskId owner = tensor->owner_task_id; if (owner.is_valid()) { - PTO2TaskSlotState *prod_state = - &sched->ring_sched_states[owner.ring()].get_slot_state_by_task_id(owner.local()); + PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(owner.ring(), owner.local()); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, sched, fc, ring_id, "creator retention" + orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "creator retention" )) { return result; } @@ -641,9 +638,9 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke auto overlap_status = lookup_result.entries[r].overlap_status; auto prod_ring = entry.producer_task_id.ring(); auto prod_local = entry.producer_task_id.local(); - PTO2TaskSlotState *prod_state = &sched->ring_sched_states[prod_ring].get_slot_state_by_task_id(prod_local); + PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(prod_ring, prod_local); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, sched, fc, ring_id, "overlap lookup" + orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "overlap lookup" )) { return result; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index 109f2e792..5157950b5 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ 
-27,12 +27,12 @@ // ============================================================================= // Fanin Spill Pool Implementation // ============================================================================= -void PTO2FaninPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive) { +void PTO2FaninPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { if (sm_last_task_alive <= reclaim_task_cursor) return; int32_t scan_end = sm_last_task_alive; for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) { - PTO2TaskSlotState &slot_state = sched.ring_sched_states[ring_id].get_slot_state_by_task_id(task_id); + PTO2TaskSlotState &slot_state = sm_handle.get_slot_state_by_task_id(ring_id, task_id); PTO2TaskPayload *payload = slot_state.payload; if (payload == nullptr || payload->fanin_spill_pool != this) { continue; @@ -47,13 +47,15 @@ void PTO2FaninPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t reclaim_task_cursor = scan_end; } -void PTO2FaninPool::ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed) { +void PTO2FaninPool::ensure_space( + PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed +) { if (available() >= needed) return; int spin_count = 0; int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sched, ring_id, prev_last_alive); + reclaim(sm_handle, ring_id, prev_last_alive); if (available() >= needed) return; spin_count++; @@ -98,9 +100,9 @@ void PTO2FaninPool::ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl // ============================================================================= // Dependency List Pool Implementation // ============================================================================= -void PTO2DepListPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t 
sm_last_task_alive) { +void PTO2DepListPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { - int32_t mark = sched.ring_sched_states[ring_id].get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; + int32_t mark = sm_handle.get_slot_state_by_task_id(ring_id, sm_last_task_alive - 1).dep_pool_mark; if (mark > 0) { advance_tail(mark); } @@ -109,14 +111,14 @@ void PTO2DepListPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_ } void PTO2DepListPool::ensure_space( - PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed + PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed ) { if (available() >= needed) return; int spin_count = 0; int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sched, ring_id, prev_last_alive); + reclaim(sm_handle, ring_id, prev_last_alive); if (available() >= needed) return; spin_count++; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index fe348bebb..5f220556d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -42,8 +42,6 @@ #include "pto_shared_memory.h" #include "common/unified_log.h" -struct PTO2SchedulerState; // Forward declaration for dep_pool reclaim - // Set to 1 to enable periodic BLOCKED/Unblocked messages during spin-wait. 
#ifndef PTO2_SPIN_VERBOSE_LOGGING #define PTO2_SPIN_VERBOSE_LOGGING 1 @@ -400,9 +398,9 @@ struct PTO2FaninPool { error_code_ptr = in_error_code_ptr; } - void reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); - void ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); PTO2FaninSpillEntry *alloc() { int32_t used = top - tail; @@ -573,17 +571,17 @@ struct PTO2DepListPool { * Reclaim dead entries based on scheduler's slot state dep_pool_mark. * Safe to call multiple times — only advances tail forward. * - * @param sched Scheduler state (for reading slot dep_pool_mark) + * @param sm_handle Shared memory handle (for reading slot dep_pool_mark) * @param ring_id Ring layer index * @param sm_last_task_alive Current last_task_alive from shared memory */ - void reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); /** * Ensure dep pool for a specific ring has at least `needed` entries available. * Spin-waits for reclamation if under pressure. Detects deadlock if no progress. 
*/ - void ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); /** * Allocate a single entry from the pool (single-thread per pool instance) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index 19645635f..a4711d262 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -18,9 +18,7 @@ #include "pto_scheduler.h" #include -#include #include -#include #include "common/unified_log.h" // ============================================================================= @@ -123,14 +121,10 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, task_window_size = sm_handle->header->rings[ring_id].task_window_size; task_window_mask = static_cast(task_window_size - 1); last_task_alive = 0; - slot_states = nullptr; advance_lock.store(0, std::memory_order_relaxed); - // Allocate per-task slot state array (dynamically sized based on runtime window_size) - slot_states = new (std::nothrow) PTO2TaskSlotState[task_window_size]; - if (!slot_states) { - return false; - } + // Point into shared memory (allocated by pto2_sm_create) + slot_states = sm_handle->slot_states[ring_id]; // Initialize all per-task slot state fields. 
// bind() sets payload, task, ring_id — immutable after init, bound once @@ -149,11 +143,7 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, return true; } -void PTO2SchedulerState::RingSchedState::destroy() { - if (!slot_states) return; - delete[] slot_states; - slot_states = nullptr; -} +void PTO2SchedulerState::RingSchedState::destroy() { slot_states = nullptr; } bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, int32_t dep_pool_capacity) { sched->sm_handle = sm_handle; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index 210a1112f..ee9e3a49f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -648,7 +648,7 @@ struct PTO2SchedulerState { int32_t wfanin = ws->payload->fanin_actual_count; if (wfanin > 0 && rss.dep_pool.available() < wfanin) { - rss.dep_pool.reclaim(*this, ring_id, rss.last_task_alive); + rss.dep_pool.reclaim(*sm_handle, ring_id, rss.last_task_alive); if (wfanin > 0 && rss.dep_pool.available() < wfanin) { break; // not enough dep_pool space — keep remainder for next call } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index d77829422..dfabca42f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -45,6 +45,7 @@ uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_M for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + size += PTO2_ALIGN_UP(task_window_sizes[r] * 
sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } return size; @@ -64,11 +65,17 @@ pto2_sm_setup_pointers_per_ring(PTO2SharedMemoryHandle *handle, const uint64_t t // Per-ring task descriptors and payloads for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + handle->task_window_sizes[r] = task_window_sizes[r]; + handle->task_window_masks[r] = static_cast(task_window_sizes[r] - 1); + handle->task_descriptors[r] = (PTO2TaskDescriptor *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); handle->task_payloads[r] = (PTO2TaskPayload *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + + handle->slot_states[r] = (PTO2TaskSlotState *)ptr; + ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } } @@ -187,6 +194,7 @@ void pto2_sm_init_header_per_ring( header->rings[r].task_descriptors_offset = offset; offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } header->total_size = handle->sm_size; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index 9ccf33226..ed4b6057f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -19,9 +19,11 @@ * +---------------------------+ * | Ring 0: TaskDescriptor[] | * | Ring 0: TaskPayload[] | + * | Ring 0: TaskSlotState[] | * +---------------------------+ * | Ring 1: TaskDescriptor[] | * | Ring 1: TaskPayload[] | + * | Ring 1: TaskSlotState[] | * +---------------------------+ * | ... 
| * +---------------------------+ @@ -136,9 +138,20 @@ struct PTO2SharedMemoryHandle { PTO2SharedMemoryHeader *header; PTO2TaskDescriptor *task_descriptors[PTO2_MAX_RING_DEPTH]; PTO2TaskPayload *task_payloads[PTO2_MAX_RING_DEPTH]; + PTO2TaskSlotState *slot_states[PTO2_MAX_RING_DEPTH]; + + // Cached per-ring layout (avoids indirection through header) + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + int32_t task_window_masks[PTO2_MAX_RING_DEPTH]; // Ownership flag bool is_owner; // True if this handle allocated the memory + + PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { return slot_states[ring_id][slot]; } + + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t ring_id, int32_t local_id) { + return slot_states[ring_id][local_id & task_window_masks[ring_id]]; + } }; // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 74b6d49ba..c4cf08c91 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -198,8 +198,8 @@ struct PTO2FaninBuilder { static bool pto2_append_fanin_or_fail( PTO2OrchestratorState *orch, PTO2TaskId task_id, int32_t tensor_arg_index, TensorArgType ptype, - PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, PTO2SchedulerState *sched, PTO2RingFlowControl &fc, - uint8_t ring_id, const char *reason + PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, PTO2RingFlowControl &fc, uint8_t ring_id, + const char *reason ) { if (fanin_builder->contains(prod_state)) { return true; @@ -210,11 +210,11 @@ static bool pto2_append_fanin_or_fail( return true; } - if (sched == nullptr || fanin_builder->spill_pool == nullptr) { + if (fanin_builder->spill_pool == nullptr) { LOG_ERROR("========================================"); 
LOG_ERROR("FATAL: Fanin Spill Builder Misconfigured!"); LOG_ERROR("========================================"); - LOG_ERROR("Missing scheduler or fanin spill pool while appending dynamic fanin."); + LOG_ERROR("Missing fanin spill pool while appending dynamic fanin."); LOG_ERROR(" task_id.raw: %" PRIu64, task_id.raw); LOG_ERROR(" tensor_arg_index: %d", tensor_arg_index); LOG_ERROR(" tensor_arg_type: %d", static_cast(ptype)); @@ -225,7 +225,7 @@ static bool pto2_append_fanin_or_fail( } PTO2FaninPool &fanin_pool = *fanin_builder->spill_pool; - fanin_pool.ensure_space(*sched, fc, ring_id, 1); + fanin_pool.ensure_space(*orch->sm_handle, fc, ring_id, 1); int32_t spill_idx = fanin_pool.top; PTO2FaninSpillEntry *entry = fanin_pool.alloc(); if (entry == nullptr) { @@ -321,16 +321,14 @@ static bool pto2_prepare_task( return false; } - auto sched = orch->scheduler; out->alloc_result = allocator.alloc(total_output_size); if (out->alloc_result.failed()) { pto2_orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK); return false; } - auto &rs = sched->ring_sched_states[ring_id]; out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); - out->slot_state = &rs.get_slot_state_by_slot(out->alloc_result.slot); + out->slot_state = &orch->sm_handle->get_slot_state_by_slot(ring_id, out->alloc_result.slot); out->task = &orch->sm_handle->task_descriptors[ring_id][out->alloc_result.slot]; out->payload = &orch->sm_handle->task_payloads[ring_id][out->alloc_result.slot]; @@ -617,10 +615,9 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke // Step A: creator retention — all existing tensors extend their creator lifetime. 
PTO2TaskId owner = tensor->owner_task_id; if (owner.is_valid()) { - PTO2TaskSlotState *prod_state = - &sched->ring_sched_states[owner.ring()].get_slot_state_by_task_id(owner.local()); + PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(owner.ring(), owner.local()); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, sched, fc, ring_id, "creator retention" + orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "creator retention" )) { return result; } @@ -642,9 +639,9 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke auto overlap_status = lookup_result.entries[r].overlap_status; auto prod_ring = entry.producer_task_id.ring(); auto prod_local = entry.producer_task_id.local(); - PTO2TaskSlotState *prod_state = &sched->ring_sched_states[prod_ring].get_slot_state_by_task_id(prod_local); + PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(prod_ring, prod_local); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, sched, fc, ring_id, "overlap lookup" + orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "overlap lookup" )) { return result; } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index 109f2e792..5157950b5 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -27,12 +27,12 @@ // ============================================================================= // Fanin Spill Pool Implementation // ============================================================================= -void PTO2FaninPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive) { +void PTO2FaninPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { 
if (sm_last_task_alive <= reclaim_task_cursor) return; int32_t scan_end = sm_last_task_alive; for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) { - PTO2TaskSlotState &slot_state = sched.ring_sched_states[ring_id].get_slot_state_by_task_id(task_id); + PTO2TaskSlotState &slot_state = sm_handle.get_slot_state_by_task_id(ring_id, task_id); PTO2TaskPayload *payload = slot_state.payload; if (payload == nullptr || payload->fanin_spill_pool != this) { continue; @@ -47,13 +47,15 @@ void PTO2FaninPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t reclaim_task_cursor = scan_end; } -void PTO2FaninPool::ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed) { +void PTO2FaninPool::ensure_space( + PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed +) { if (available() >= needed) return; int spin_count = 0; int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sched, ring_id, prev_last_alive); + reclaim(sm_handle, ring_id, prev_last_alive); if (available() >= needed) return; spin_count++; @@ -98,9 +100,9 @@ void PTO2FaninPool::ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl // ============================================================================= // Dependency List Pool Implementation // ============================================================================= -void PTO2DepListPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive) { +void PTO2DepListPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { - int32_t mark = sched.ring_sched_states[ring_id].get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; + int32_t mark = sm_handle.get_slot_state_by_task_id(ring_id, sm_last_task_alive - 
1).dep_pool_mark; if (mark > 0) { advance_tail(mark); } @@ -109,14 +111,14 @@ void PTO2DepListPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_ } void PTO2DepListPool::ensure_space( - PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed + PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed ) { if (available() >= needed) return; int spin_count = 0; int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sched, ring_id, prev_last_alive); + reclaim(sm_handle, ring_id, prev_last_alive); if (available() >= needed) return; spin_count++; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index fe348bebb..5f220556d 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -42,8 +42,6 @@ #include "pto_shared_memory.h" #include "common/unified_log.h" -struct PTO2SchedulerState; // Forward declaration for dep_pool reclaim - // Set to 1 to enable periodic BLOCKED/Unblocked messages during spin-wait. #ifndef PTO2_SPIN_VERBOSE_LOGGING #define PTO2_SPIN_VERBOSE_LOGGING 1 @@ -400,9 +398,9 @@ struct PTO2FaninPool { error_code_ptr = in_error_code_ptr; } - void reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); - void ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); PTO2FaninSpillEntry *alloc() { int32_t used = top - tail; @@ -573,17 +571,17 @@ struct PTO2DepListPool { * Reclaim dead entries based on scheduler's slot state dep_pool_mark. 
* Safe to call multiple times — only advances tail forward. * - * @param sched Scheduler state (for reading slot dep_pool_mark) + * @param sm_handle Shared memory handle (for reading slot dep_pool_mark) * @param ring_id Ring layer index * @param sm_last_task_alive Current last_task_alive from shared memory */ - void reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); /** * Ensure dep pool for a specific ring has at least `needed` entries available. * Spin-waits for reclamation if under pressure. Detects deadlock if no progress. */ - void ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); /** * Allocate a single entry from the pool (single-thread per pool instance) diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index 19645635f..a4711d262 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -18,9 +18,7 @@ #include "pto_scheduler.h" #include -#include #include -#include #include "common/unified_log.h" // ============================================================================= @@ -123,14 +121,10 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, task_window_size = sm_handle->header->rings[ring_id].task_window_size; task_window_mask = static_cast(task_window_size - 1); last_task_alive = 0; - slot_states = nullptr; advance_lock.store(0, std::memory_order_relaxed); - // Allocate per-task slot state array (dynamically sized based on runtime window_size) - slot_states = new (std::nothrow) PTO2TaskSlotState[task_window_size]; - if (!slot_states) { - return 
false; - } + // Point into shared memory (allocated by pto2_sm_create) + slot_states = sm_handle->slot_states[ring_id]; // Initialize all per-task slot state fields. // bind() sets payload, task, ring_id — immutable after init, bound once @@ -149,11 +143,7 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, return true; } -void PTO2SchedulerState::RingSchedState::destroy() { - if (!slot_states) return; - delete[] slot_states; - slot_states = nullptr; -} +void PTO2SchedulerState::RingSchedState::destroy() { slot_states = nullptr; } bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, int32_t dep_pool_capacity) { sched->sm_handle = sm_handle; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index a8c7fe8b6..edd2c8afe 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -651,7 +651,7 @@ struct PTO2SchedulerState { int32_t wfanin = ws->payload->fanin_actual_count; if (wfanin > 0 && rss.dep_pool.available() < wfanin) { - rss.dep_pool.reclaim(*this, ring_id, rss.last_task_alive); + rss.dep_pool.reclaim(*sm_handle, ring_id, rss.last_task_alive); if (wfanin > 0 && rss.dep_pool.available() < wfanin) { break; // not enough dep_pool space — keep remainder for next call } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index d77829422..dfabca42f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -45,6 +45,7 @@ uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_M for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), 
PTO2_ALIGN_SIZE); size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } return size; @@ -64,11 +65,17 @@ pto2_sm_setup_pointers_per_ring(PTO2SharedMemoryHandle *handle, const uint64_t t // Per-ring task descriptors and payloads for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + handle->task_window_sizes[r] = task_window_sizes[r]; + handle->task_window_masks[r] = static_cast(task_window_sizes[r] - 1); + handle->task_descriptors[r] = (PTO2TaskDescriptor *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); handle->task_payloads[r] = (PTO2TaskPayload *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + + handle->slot_states[r] = (PTO2TaskSlotState *)ptr; + ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } } @@ -187,6 +194,7 @@ void pto2_sm_init_header_per_ring( header->rings[r].task_descriptors_offset = offset; offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } header->total_size = handle->sm_size; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index dccc47816..de7d0b71b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -19,9 +19,11 @@ * +---------------------------+ * | Ring 0: TaskDescriptor[] | * | Ring 0: TaskPayload[] | + * | Ring 0: TaskSlotState[] | * +---------------------------+ * | Ring 1: TaskDescriptor[] | * | Ring 1: TaskPayload[] | + * | Ring 1: TaskSlotState[] | * 
+---------------------------+ * | ... | * +---------------------------+ @@ -136,9 +138,20 @@ struct PTO2SharedMemoryHandle { PTO2SharedMemoryHeader *header; PTO2TaskDescriptor *task_descriptors[PTO2_MAX_RING_DEPTH]; PTO2TaskPayload *task_payloads[PTO2_MAX_RING_DEPTH]; + PTO2TaskSlotState *slot_states[PTO2_MAX_RING_DEPTH]; + + // Cached per-ring layout (avoids indirection through header) + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + int32_t task_window_masks[PTO2_MAX_RING_DEPTH]; // Ownership flag bool is_owner; // True if this handle allocated the memory + + PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { return slot_states[ring_id][slot]; } + + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t ring_id, int32_t local_id) { + return slot_states[ring_id][local_id & task_window_masks[ring_id]]; + } }; // ============================================================================= From 310dca714d234523b2c1bdba49fe925638ba032a Mon Sep 17 00:00:00 2001 From: poursoul Date: Tue, 21 Apr 2026 11:01:43 +0800 Subject: [PATCH 2/4] Refactor: consolidate per-ring data into PTO2SharedMemoryRingHeader - Move task_descriptors, task_payloads, slot_states pointers and task_window_mask from PTO2SharedMemoryHandle into PTO2SharedMemoryRingHeader (alignas(64)), with get_slot_state_by_* accessors - Slim PTO2SharedMemoryHandle to lifecycle-only (sm_base, sm_size, header, is_owner) - Orchestrator/scheduler store PTO2SharedMemoryHeader* (sm_header) instead of PTO2SharedMemoryHandle*, eliminating one indirection on every per-ring access - FaninPool/DepListPool reclaim/ensure_space now take PTO2SharedMemoryRingHeader& directly, removing ring_id and fc params - pto2_append_fanin_or_fail drops fc parameter (ring header carries fc) - Upstream callers (pto_runtime2.cpp) pass handle->header - Applied to a2a3 and a5 tensormap_and_ringbuffer variants --- .../runtime/pto_orchestrator.cpp | 42 +++++++++---------- .../runtime/pto_orchestrator.h | 6 +-- 
.../runtime/pto_ring_buffer.cpp | 36 +++++++--------- .../runtime/pto_ring_buffer.h | 11 +++-- .../runtime/pto_runtime2.cpp | 8 ++-- .../runtime/pto_scheduler.cpp | 21 +++++----- .../runtime/pto_scheduler.h | 12 +++--- .../runtime/pto_shared_memory.cpp | 15 ++++--- .../runtime/pto_shared_memory.h | 39 +++++++++-------- .../runtime/pto_orchestrator.cpp | 42 +++++++++---------- .../runtime/pto_orchestrator.h | 4 +- .../runtime/pto_ring_buffer.cpp | 36 +++++++--------- .../runtime/pto_ring_buffer.h | 11 +++-- .../runtime/pto_runtime2.cpp | 8 ++-- .../runtime/pto_scheduler.cpp | 21 +++++----- .../runtime/pto_scheduler.h | 12 +++--- .../runtime/pto_shared_memory.cpp | 15 ++++--- .../runtime/pto_shared_memory.h | 39 +++++++++-------- 18 files changed, 183 insertions(+), 195 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index b5f86899e..fb99efaa5 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -127,12 +127,12 @@ static void *pto2_aligned_zalloc(size_t size, size_t alignment) { static int32_t pto2_orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) { always_assert(orch != nullptr); orch->fatal = true; - if (error_code == PTO2_ERROR_NONE || orch->sm_handle == nullptr || orch->sm_handle->header == nullptr) { + if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) { return PTO2_ERROR_NONE; } int32_t expected = PTO2_ERROR_NONE; - std::atomic<int32_t> &orch_error_code = orch->sm_handle->header->orch_error_code; + std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code; if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) { return error_code; } @@ -198,8 +198,7 @@ struct PTO2FaninBuilder { static bool pto2_append_fanin_or_fail( PTO2OrchestratorState *orch, PTO2TaskId task_id, 
int32_t tensor_arg_index, TensorArgType ptype, - PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, PTO2RingFlowControl &fc, uint8_t ring_id, - const char *reason + PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, uint8_t ring_id, const char *reason ) { if (fanin_builder->contains(prod_state)) { return true; @@ -225,7 +224,7 @@ static bool pto2_append_fanin_or_fail( } PTO2FaninPool &fanin_pool = *fanin_builder->spill_pool; - fanin_pool.ensure_space(*orch->sm_handle, fc, ring_id, 1); + fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1); int32_t spill_idx = fanin_pool.top; PTO2FaninSpillEntry *entry = fanin_pool.alloc(); if (entry == nullptr) { @@ -328,9 +327,9 @@ static bool pto2_prepare_task( } out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); - out->slot_state = &orch->sm_handle->get_slot_state_by_slot(ring_id, out->alloc_result.slot); - out->task = &orch->sm_handle->task_descriptors[ring_id][out->alloc_result.slot]; - out->payload = &orch->sm_handle->task_payloads[ring_id][out->alloc_result.slot]; + out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot); + out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot]; + out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot]; pto2_prefetch_payload(out->payload, args.tensor_count(), args.scalar_count()); @@ -358,12 +357,12 @@ static bool pto2_prepare_task( // ============================================================================= bool pto2_orchestrator_init( - PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, + PTO2OrchestratorState *orch, PTO2SharedMemoryHeader *sm_header, void *gm_heap, uint64_t heap_size, int32_t dep_pool_capacity ) { *orch = PTO2OrchestratorState{}; - orch->sm_handle = sm_handle; + orch->sm_header = sm_header; orch->gm_heap_base = gm_heap; orch->gm_heap_size = heap_size * 
PTO2_MAX_RING_DEPTH; orch->fatal = false; @@ -371,12 +370,12 @@ bool pto2_orchestrator_init( // Initialize per-ring resources for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; - auto &fc = sm_handle->header->rings[r].fc; + auto &ring = sm_header->rings[r]; // Initialize unified task allocator orch->rings[r].task_allocator.init( - sm_handle->task_descriptors[r], sm_handle->header->rings[r].task_window_size, &fc.current_task_index, - &fc.last_task_alive, ring_heap_base, heap_size, &sm_handle->header->orch_error_code + ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive, + ring_heap_base, heap_size, &sm_header->orch_error_code ); size_t fanin_pool_bytes = @@ -389,13 +388,13 @@ bool pto2_orchestrator_init( } return false; } - orch->rings[r].fanin_pool.init(fanin_entries, dep_pool_capacity, &sm_handle->header->orch_error_code); + orch->rings[r].fanin_pool.init(fanin_entries, dep_pool_capacity, &sm_header->orch_error_code); } // Initialize TensorMap with per-ring task window sizes int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = sm_handle->header->rings[r].task_window_size; + task_window_sizes[r] = sm_header->rings[r].task_window_size; } if (!orch->tensor_map.init_default(task_window_sizes)) { for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { @@ -572,7 +571,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke } uint8_t ring_id = prepared.task_id.ring(); PTO2SchedulerState *sched = orch->scheduler; - PTO2RingFlowControl &fc = orch->sm_handle->header->rings[ring_id].fc; + PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc; PTO2TaskId task_id = prepared.task_id; PTO2TaskSlotState &cur_slot_state = *prepared.slot_state; PTO2TaskDescriptor &task = *prepared.task; @@ -614,9 +613,10 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels 
&mixed_ke // Step A: creator retention — all existing tensors extend their creator lifetime. PTO2TaskId owner = tensor->owner_task_id; if (owner.is_valid()) { - PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(owner.ring(), owner.local()); + PTO2TaskSlotState *prod_state = + &orch->sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "creator retention" + orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "creator retention" )) { return result; } @@ -638,9 +638,9 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke auto overlap_status = lookup_result.entries[r].overlap_status; auto prod_ring = entry.producer_task_id.ring(); auto prod_local = entry.producer_task_id.local(); - PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(prod_ring, prod_local); + PTO2TaskSlotState *prod_state = &orch->sm_header->rings[prod_ring].get_slot_state_by_task_id(prod_local); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "overlap lookup" + orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "overlap lookup" )) { return result; } @@ -839,7 +839,7 @@ void pto2_orchestrator_done(PTO2OrchestratorState *orch) { ); } } - orch->sm_handle->header->orchestrator_done.store(1, std::memory_order_release); + orch->sm_header->orchestrator_done.store(1, std::memory_order_release); #if !PTO2_ORCH_PROFILING && PTO2_PROFILING g_orch_submit_idx = 0; #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 0ad5e6873..3ef545bbc 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -47,7 +47,7 @@ */ struct 
PTO2OrchestratorState { // === SHARED MEMORY ACCESS === - PTO2SharedMemoryHandle *sm_handle; + PTO2SharedMemoryHeader *sm_header; // === PER-RING RESOURCES === PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; @@ -120,13 +120,13 @@ struct PTO2OrchestratorState { * Initialize orchestrator state * * @param orch Orchestrator state to initialize - * @param sm_handle Shared memory handle + * @param sm_header Shared memory header * @param gm_heap GM heap memory for output buffers * @param heap_size Size of GM heap * @return true on success */ bool pto2_orchestrator_init( - PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, + PTO2OrchestratorState *orch, PTO2SharedMemoryHeader *sm_header, void *gm_heap, uint64_t heap_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index 5157950b5..94493bddb 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -27,12 +27,12 @@ // ============================================================================= // Fanin Spill Pool Implementation // ============================================================================= -void PTO2FaninPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { +void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) { if (sm_last_task_alive <= reclaim_task_cursor) return; int32_t scan_end = sm_last_task_alive; for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) { - PTO2TaskSlotState &slot_state = sm_handle.get_slot_state_by_task_id(ring_id, task_id); + PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(task_id); PTO2TaskPayload *payload = slot_state.payload; if (payload == nullptr || 
payload->fanin_spill_pool != this) { continue; @@ -47,29 +47,27 @@ void PTO2FaninPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, reclaim_task_cursor = scan_end; } -void PTO2FaninPool::ensure_space( - PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed -) { +void PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) { if (available() >= needed) return; int spin_count = 0; - int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sm_handle, ring_id, prev_last_alive); + reclaim(ring, prev_last_alive); if (available() >= needed) return; spin_count++; - int32_t cur_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); if (cur_last_alive > prev_last_alive) { spin_count = 0; prev_last_alive = cur_last_alive; } if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { - int32_t current = fc.current_task_index.load(std::memory_order_acquire); + int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire); LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected! 
(ring %d)", ring_id); + LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!"); LOG_ERROR("========================================"); LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count); LOG_ERROR( @@ -100,9 +98,9 @@ void PTO2FaninPool::ensure_space( // ============================================================================= // Dependency List Pool Implementation // ============================================================================= -void PTO2DepListPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { +void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) { if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { - int32_t mark = sm_handle.get_slot_state_by_task_id(ring_id, sm_last_task_alive - 1).dep_pool_mark; + int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; if (mark > 0) { advance_tail(mark); } @@ -110,30 +108,28 @@ void PTO2DepListPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id } } -void PTO2DepListPool::ensure_space( - PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed -) { +void PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) { if (available() >= needed) return; int spin_count = 0; - int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sm_handle, ring_id, prev_last_alive); + reclaim(ring, prev_last_alive); if (available() >= needed) return; spin_count++; // Progress detection: reset spin counter if last_task_alive advances - int32_t cur_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); if (cur_last_alive > 
prev_last_alive) { spin_count = 0; prev_last_alive = cur_last_alive; } if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { - int32_t current = fc.current_task_index.load(std::memory_order_acquire); + int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire); LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Dependency Pool Deadlock Detected! (ring %d)", ring_id); + LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!"); LOG_ERROR("========================================"); LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count); LOG_ERROR( diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5f220556d..05eeb32a9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -398,9 +398,9 @@ struct PTO2FaninPool { error_code_ptr = in_error_code_ptr; } - void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); - void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); PTO2FaninSpillEntry *alloc() { int32_t used = top - tail; @@ -571,17 +571,16 @@ struct PTO2DepListPool { * Reclaim dead entries based on scheduler's slot state dep_pool_mark. * Safe to call multiple times — only advances tail forward. 
* - * @param sm_handle Shared memory handle (for reading slot dep_pool_mark) - * @param ring_id Ring layer index + * @param ring Ring header (for reading slot dep_pool_mark) * @param sm_last_task_alive Current last_task_alive from shared memory */ - void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); /** * Ensure dep pool for a specific ring has at least `needed` entries available. * Spin-waits for reclamation if under pressure. Detects deadlock if no progress. */ - void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); /** * Allocate a single entry from the pool (single-thread per pool instance) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index a7c340960..6ff0bdef3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -271,7 +271,7 @@ PTO2Runtime *pto2_runtime_create_custom( rt->gm_heap_owned = true; // Initialize orchestrator - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { + if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle->header, rt->gm_heap, heap_size, dep_pool_capacity)) { free(rt->gm_heap); pto2_sm_destroy(rt->sm_handle); free(rt); @@ -279,7 +279,7 @@ PTO2Runtime *pto2_runtime_create_custom( } // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, dep_pool_capacity)) { + if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle->header, dep_pool_capacity)) { pto2_orchestrator_destroy(&rt->orchestrator); free(rt->gm_heap); pto2_sm_destroy(rt->sm_handle); @@ -309,13 +309,13 @@ 
PTO2Runtime *pto2_runtime_create_from_sm( rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; rt->gm_heap_owned = false; - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { + if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle->header, rt->gm_heap, heap_size, dep_pool_capacity)) { free(rt); return NULL; } // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, dep_pool_capacity)) { + if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle->header, dep_pool_capacity)) { pto2_orchestrator_destroy(&rt->orchestrator); free(rt); return NULL; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index a4711d262..5e304b334 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -116,22 +116,23 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue *queue) { // Scheduler Initialization // ============================================================================= -bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, int32_t ring_id) { - task_descriptors = sm_handle->task_descriptors[ring_id]; - task_window_size = sm_handle->header->rings[ring_id].task_window_size; - task_window_mask = static_cast(task_window_size - 1); +bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { + auto &ring = sm_header->rings[ring_id]; + task_descriptors = ring.task_descriptors; + task_window_size = ring.task_window_size; + task_window_mask = ring.task_window_mask; last_task_alive = 0; advance_lock.store(0, std::memory_order_relaxed); // Point into shared memory (allocated by pto2_sm_create) - slot_states = sm_handle->slot_states[ring_id]; + slot_states = ring.slot_states; // Initialize 
all per-task slot state fields. // bind() sets payload, task, ring_id — immutable after init, bound once // to their fixed shared-memory addresses. // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, // rest zero) so the first submit needs no reset. - PTO2TaskPayload *payloads = sm_handle->task_payloads[ring_id]; + PTO2TaskPayload *payloads = ring.task_payloads; for (uint64_t i = 0; i < task_window_size; i++) { slot_states[i].bind(&payloads[i], &task_descriptors[i], static_cast(ring_id)); slot_states[i].reset_for_reuse(); @@ -145,8 +146,8 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, void PTO2SchedulerState::RingSchedState::destroy() { slot_states = nullptr; } -bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, int32_t dep_pool_capacity) { - sched->sm_handle = sm_handle; +bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHeader *sm_header, int32_t dep_pool_capacity) { + sched->sm_header = sm_header; #if PTO2_SCHED_PROFILING sched->tasks_completed.store(0, std::memory_order_relaxed); sched->tasks_consumed.store(0, std::memory_order_relaxed); @@ -154,7 +155,7 @@ bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_h // Initialize per-ring state for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_handle, r)) { + if (!sched->ring_sched_states[r].init(sm_header, r)) { for (int j = 0; j < r; j++) { sched->ring_sched_states[j].destroy(); } @@ -193,7 +194,7 @@ bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_h } return false; } - sched->ring_sched_states[r].dep_pool.init(dep_entries, dep_pool_capacity, &sm_handle->header->orch_error_code); + sched->ring_sched_states[r].dep_pool.init(dep_entries, dep_pool_capacity, &sm_header->orch_error_code); } // Initialize global wiring queue (SPSC: orchestrator pushes, scheduler thread 0 drains) diff --git 
a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index ee9e3a49f..96ec42621 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -517,7 +517,7 @@ struct PTO2CompletionStats { */ struct PTO2SchedulerState { // Shared memory access - PTO2SharedMemoryHandle *sm_handle; + PTO2SharedMemoryHeader *sm_header; // Per-ring state struct alignas(64) RingSchedState { @@ -534,7 +534,7 @@ struct PTO2SchedulerState { // --- Cache Line 2+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; - bool init(PTO2SharedMemoryHandle *sm_handle, int32_t ring_id); + bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); void destroy(); PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { @@ -648,7 +648,7 @@ struct PTO2SchedulerState { int32_t wfanin = ws->payload->fanin_actual_count; if (wfanin > 0 && rss.dep_pool.available() < wfanin) { - rss.dep_pool.reclaim(*sm_handle, ring_id, rss.last_task_alive); + rss.dep_pool.reclaim(sm_header->rings[ring_id], rss.last_task_alive); if (wfanin > 0 && rss.dep_pool.available() < wfanin) { break; // not enough dep_pool space — keep remainder for next call } @@ -725,7 +725,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); } } @@ -759,7 +759,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - 
ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); atomic_count += 2; // try-lock CAS + unlock store } else { @@ -1007,7 +1007,7 @@ struct PTO2SchedulerState { // ============================================================================= bool pto2_scheduler_init( - PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE + PTO2SchedulerState *sched, PTO2SharedMemoryHeader *sm_header, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); void pto2_scheduler_destroy(PTO2SchedulerState *sched); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index dfabca42f..922e0785b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -63,18 +63,16 @@ pto2_sm_setup_pointers_per_ring(PTO2SharedMemoryHandle *handle, const uint64_t t handle->header = (PTO2SharedMemoryHeader *)ptr; ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - // Per-ring task descriptors and payloads + // Per-ring task descriptors, payloads, and slot states for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - handle->task_window_sizes[r] = task_window_sizes[r]; - handle->task_window_masks[r] = static_cast(task_window_sizes[r] - 1); - - handle->task_descriptors[r] = (PTO2TaskDescriptor *)ptr; + auto &ring = handle->header->rings[r]; + ring.task_descriptors = (PTO2TaskDescriptor *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - handle->task_payloads[r] = (PTO2TaskPayload *)ptr; + ring.task_payloads = (PTO2TaskPayload *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * 
sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - handle->slot_states[r] = (PTO2TaskSlotState *)ptr; + ring.slot_states = (PTO2TaskSlotState *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } } @@ -190,6 +188,7 @@ void pto2_sm_init_header_per_ring( uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { header->rings[r].task_window_size = task_window_sizes[r]; + header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1); header->rings[r].heap_size = heap_sizes[r]; header->rings[r].task_descriptors_offset = offset; offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); @@ -266,7 +265,7 @@ bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_ if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false; // Check pointer alignment - if ((uintptr_t)handle->task_descriptors[ring_id] % PTO2_ALIGN_SIZE != 0) return false; + if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false; // Check flow control pointer sanity int32_t current = current_task_index.load(std::memory_order_acquire); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index ed4b6057f..af538822f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -75,13 +75,26 @@ static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be e /** * Per-ring shared memory header section. * - * Groups flow-control and layout info for a single ring to avoid parallel arrays. + * Groups flow-control, layout info, and per-ring data pointers for a single ring. + * Pointers are host-side only (set by pto2_sm_setup_pointers, invalid on device). 
*/ -struct PTO2SharedMemoryRingHeader { +struct alignas(64) PTO2SharedMemoryRingHeader { PTO2RingFlowControl fc; + + // Layout metadata (set once at init) uint64_t task_window_size; + int32_t task_window_mask; uint64_t heap_size; uint64_t task_descriptors_offset; // Offset from SM base, in bytes + + // Per-ring data pointers (host-side, set by pto2_sm_setup_pointers) + PTO2TaskDescriptor *task_descriptors; + PTO2TaskPayload *task_payloads; + PTO2TaskSlotState *slot_states; + + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } + + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { return slot_states[local_id & task_window_mask]; } }; /** @@ -118,8 +131,8 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { }; static_assert( - sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0, - "PTO2SharedMemoryHeader must be aligned to cache line (PTO2_ALIGN_SIZE)" + sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0 || sizeof(PTO2SharedMemoryHeader) < 4096, + "PTO2SharedMemoryHeader should be reasonably sized" ); // ============================================================================= @@ -127,31 +140,17 @@ static_assert( // ============================================================================= /** - * Handle for shared memory access - * Provides both Orchestrator and Scheduler views of the same memory + * Handle for shared memory lifecycle management (create/destroy). + * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly. 
*/ struct PTO2SharedMemoryHandle { void *sm_base; // Base address of shared memory uint64_t sm_size; // Total size of shared memory - // Quick pointers into shared memory regions (per-ring) PTO2SharedMemoryHeader *header; - PTO2TaskDescriptor *task_descriptors[PTO2_MAX_RING_DEPTH]; - PTO2TaskPayload *task_payloads[PTO2_MAX_RING_DEPTH]; - PTO2TaskSlotState *slot_states[PTO2_MAX_RING_DEPTH]; - - // Cached per-ring layout (avoids indirection through header) - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - int32_t task_window_masks[PTO2_MAX_RING_DEPTH]; // Ownership flag bool is_owner; // True if this handle allocated the memory - - PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { return slot_states[ring_id][slot]; } - - PTO2TaskSlotState &get_slot_state_by_task_id(int32_t ring_id, int32_t local_id) { - return slot_states[ring_id][local_id & task_window_masks[ring_id]]; - } }; // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index c4cf08c91..eba7bd982 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -127,12 +127,12 @@ static void *pto2_aligned_zalloc(size_t size, size_t alignment) { static int32_t pto2_orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) { always_assert(orch != nullptr); orch->fatal = true; - if (error_code == PTO2_ERROR_NONE || orch->sm_handle == nullptr || orch->sm_handle->header == nullptr) { + if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) { return PTO2_ERROR_NONE; } int32_t expected = PTO2_ERROR_NONE; - std::atomic<int32_t> &orch_error_code = orch->sm_handle->header->orch_error_code; + std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code; if (orch_error_code.compare_exchange_strong(expected, 
error_code, std::memory_order_acq_rel)) { return error_code; } @@ -198,8 +198,7 @@ struct PTO2FaninBuilder { static bool pto2_append_fanin_or_fail( PTO2OrchestratorState *orch, PTO2TaskId task_id, int32_t tensor_arg_index, TensorArgType ptype, - PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, PTO2RingFlowControl &fc, uint8_t ring_id, - const char *reason + PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, uint8_t ring_id, const char *reason ) { if (fanin_builder->contains(prod_state)) { return true; @@ -225,7 +224,7 @@ static bool pto2_append_fanin_or_fail( } PTO2FaninPool &fanin_pool = *fanin_builder->spill_pool; - fanin_pool.ensure_space(*orch->sm_handle, fc, ring_id, 1); + fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1); int32_t spill_idx = fanin_pool.top; PTO2FaninSpillEntry *entry = fanin_pool.alloc(); if (entry == nullptr) { @@ -328,9 +327,9 @@ static bool pto2_prepare_task( } out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); - out->slot_state = &orch->sm_handle->get_slot_state_by_slot(ring_id, out->alloc_result.slot); - out->task = &orch->sm_handle->task_descriptors[ring_id][out->alloc_result.slot]; - out->payload = &orch->sm_handle->task_payloads[ring_id][out->alloc_result.slot]; + out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot); + out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot]; + out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot]; pto2_prefetch_payload(out->payload, args.tensor_count(), args.scalar_count()); @@ -358,12 +357,12 @@ static bool pto2_prepare_task( // ============================================================================= bool pto2_orchestrator_init( - PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, + PTO2OrchestratorState *orch, PTO2SharedMemoryHeader *sm_header, void *gm_heap, uint64_t 
heap_size, int32_t dep_pool_capacity ) { *orch = PTO2OrchestratorState{}; - orch->sm_handle = sm_handle; + orch->sm_header = sm_header; orch->gm_heap_base = gm_heap; orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; orch->fatal = false; @@ -371,12 +370,12 @@ bool pto2_orchestrator_init( // Initialize per-ring resources for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { void *ring_heap_base = reinterpret_cast(gm_heap) + r * heap_size; - auto &fc = sm_handle->header->rings[r].fc; + auto &ring = sm_header->rings[r]; // Initialize unified task allocator orch->rings[r].task_allocator.init( - sm_handle->task_descriptors[r], sm_handle->header->rings[r].task_window_size, &fc.current_task_index, - &fc.last_task_alive, ring_heap_base, heap_size, &sm_handle->header->orch_error_code + ring.task_descriptors, ring.task_window_size, &ring.fc.current_task_index, &ring.fc.last_task_alive, + ring_heap_base, heap_size, &sm_header->orch_error_code ); size_t fanin_pool_bytes = @@ -389,13 +388,13 @@ bool pto2_orchestrator_init( } return false; } - orch->rings[r].fanin_pool.init(fanin_entries, dep_pool_capacity, &sm_handle->header->orch_error_code); + orch->rings[r].fanin_pool.init(fanin_entries, dep_pool_capacity, &sm_header->orch_error_code); } // Initialize TensorMap with per-ring task window sizes int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = sm_handle->header->rings[r].task_window_size; + task_window_sizes[r] = sm_header->rings[r].task_window_size; } if (!orch->tensor_map.init_default(task_window_sizes)) { for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { @@ -573,7 +572,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke } uint8_t ring_id = prepared.task_id.ring(); PTO2SchedulerState *sched = orch->scheduler; - PTO2RingFlowControl &fc = orch->sm_handle->header->rings[ring_id].fc; + PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc; PTO2TaskId task_id = 
prepared.task_id; PTO2TaskSlotState &cur_slot_state = *prepared.slot_state; PTO2TaskDescriptor &task = *prepared.task; @@ -615,9 +614,10 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke // Step A: creator retention — all existing tensors extend their creator lifetime. PTO2TaskId owner = tensor->owner_task_id; if (owner.is_valid()) { - PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(owner.ring(), owner.local()); + PTO2TaskSlotState *prod_state = + &orch->sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "creator retention" + orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "creator retention" )) { return result; } @@ -639,9 +639,9 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke auto overlap_status = lookup_result.entries[r].overlap_status; auto prod_ring = entry.producer_task_id.ring(); auto prod_local = entry.producer_task_id.local(); - PTO2TaskSlotState *prod_state = &orch->sm_handle->get_slot_state_by_task_id(prod_ring, prod_local); + PTO2TaskSlotState *prod_state = &orch->sm_header->rings[prod_ring].get_slot_state_by_task_id(prod_local); if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, fc, ring_id, "overlap lookup" + orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "overlap lookup" )) { return result; } @@ -843,7 +843,7 @@ void pto2_orchestrator_done(PTO2OrchestratorState *orch) { ); } } - orch->sm_handle->header->orchestrator_done.store(1, std::memory_order_release); + orch->sm_header->orchestrator_done.store(1, std::memory_order_release); #if !PTO2_ORCH_PROFILING && PTO2_PROFILING g_orch_submit_idx = 0; #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 
0ad5e6873..4f86e1f76 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -47,7 +47,7 @@ */ struct PTO2OrchestratorState { // === SHARED MEMORY ACCESS === - PTO2SharedMemoryHandle *sm_handle; + PTO2SharedMemoryHeader *sm_header; // === PER-RING RESOURCES === PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; @@ -126,7 +126,7 @@ struct PTO2OrchestratorState { * @return true on success */ bool pto2_orchestrator_init( - PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, + PTO2OrchestratorState *orch, PTO2SharedMemoryHeader *sm_header, void *gm_heap, uint64_t heap_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index 5157950b5..94493bddb 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -27,12 +27,12 @@ // ============================================================================= // Fanin Spill Pool Implementation // ============================================================================= -void PTO2FaninPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { +void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) { if (sm_last_task_alive <= reclaim_task_cursor) return; int32_t scan_end = sm_last_task_alive; for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) { - PTO2TaskSlotState &slot_state = sm_handle.get_slot_state_by_task_id(ring_id, task_id); + PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(task_id); PTO2TaskPayload *payload = slot_state.payload; if (payload == nullptr || payload->fanin_spill_pool != this) { continue; @@ -47,29 +47,27 @@ 
void PTO2FaninPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, reclaim_task_cursor = scan_end; } -void PTO2FaninPool::ensure_space( - PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed -) { +void PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) { if (available() >= needed) return; int spin_count = 0; - int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sm_handle, ring_id, prev_last_alive); + reclaim(ring, prev_last_alive); if (available() >= needed) return; spin_count++; - int32_t cur_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); if (cur_last_alive > prev_last_alive) { spin_count = 0; prev_last_alive = cur_last_alive; } if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { - int32_t current = fc.current_task_index.load(std::memory_order_acquire); + int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire); LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected! 
(ring %d)", ring_id); + LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!"); LOG_ERROR("========================================"); LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count); LOG_ERROR( @@ -100,9 +98,9 @@ void PTO2FaninPool::ensure_space( // ============================================================================= // Dependency List Pool Implementation // ============================================================================= -void PTO2DepListPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive) { +void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) { if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { - int32_t mark = sm_handle.get_slot_state_by_task_id(ring_id, sm_last_task_alive - 1).dep_pool_mark; + int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; if (mark > 0) { advance_tail(mark); } @@ -110,30 +108,28 @@ void PTO2DepListPool::reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id } } -void PTO2DepListPool::ensure_space( - PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed -) { +void PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) { if (available() >= needed) return; int spin_count = 0; - int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); while (available() < needed) { - reclaim(sm_handle, ring_id, prev_last_alive); + reclaim(ring, prev_last_alive); if (available() >= needed) return; spin_count++; // Progress detection: reset spin counter if last_task_alive advances - int32_t cur_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); if (cur_last_alive > 
prev_last_alive) { spin_count = 0; prev_last_alive = cur_last_alive; } if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { - int32_t current = fc.current_task_index.load(std::memory_order_acquire); + int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire); LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Dependency Pool Deadlock Detected! (ring %d)", ring_id); + LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!"); LOG_ERROR("========================================"); LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count); LOG_ERROR( diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5f220556d..05eeb32a9 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -398,9 +398,9 @@ struct PTO2FaninPool { error_code_ptr = in_error_code_ptr; } - void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); - void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); PTO2FaninSpillEntry *alloc() { int32_t used = top - tail; @@ -571,17 +571,16 @@ struct PTO2DepListPool { * Reclaim dead entries based on scheduler's slot state dep_pool_mark. * Safe to call multiple times — only advances tail forward. 
* - * @param sm_handle Shared memory handle (for reading slot dep_pool_mark) - * @param ring_id Ring layer index + * @param ring Ring header (for reading slot dep_pool_mark) * @param sm_last_task_alive Current last_task_alive from shared memory */ - void reclaim(PTO2SharedMemoryHandle &sm_handle, uint8_t ring_id, int32_t sm_last_task_alive); + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); /** * Ensure dep pool for a specific ring has at least `needed` entries available. * Spin-waits for reclamation if under pressure. Detects deadlock if no progress. */ - void ensure_space(PTO2SharedMemoryHandle &sm_handle, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); + void ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); /** * Allocate a single entry from the pool (single-thread per pool instance) diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index a7c340960..6ff0bdef3 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -271,7 +271,7 @@ PTO2Runtime *pto2_runtime_create_custom( rt->gm_heap_owned = true; // Initialize orchestrator - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { + if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle->header, rt->gm_heap, heap_size, dep_pool_capacity)) { free(rt->gm_heap); pto2_sm_destroy(rt->sm_handle); free(rt); @@ -279,7 +279,7 @@ PTO2Runtime *pto2_runtime_create_custom( } // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, dep_pool_capacity)) { + if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle->header, dep_pool_capacity)) { pto2_orchestrator_destroy(&rt->orchestrator); free(rt->gm_heap); pto2_sm_destroy(rt->sm_handle); @@ -309,13 +309,13 @@ PTO2Runtime 
*pto2_runtime_create_from_sm( rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; rt->gm_heap_owned = false; - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { + if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle->header, rt->gm_heap, heap_size, dep_pool_capacity)) { free(rt); return NULL; } // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, dep_pool_capacity)) { + if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle->header, dep_pool_capacity)) { pto2_orchestrator_destroy(&rt->orchestrator); free(rt); return NULL; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index a4711d262..5e304b334 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -116,22 +116,23 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue *queue) { // Scheduler Initialization // ============================================================================= -bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, int32_t ring_id) { - task_descriptors = sm_handle->task_descriptors[ring_id]; - task_window_size = sm_handle->header->rings[ring_id].task_window_size; - task_window_mask = static_cast(task_window_size - 1); +bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { + auto &ring = sm_header->rings[ring_id]; + task_descriptors = ring.task_descriptors; + task_window_size = ring.task_window_size; + task_window_mask = ring.task_window_mask; last_task_alive = 0; advance_lock.store(0, std::memory_order_relaxed); // Point into shared memory (allocated by pto2_sm_create) - slot_states = sm_handle->slot_states[ring_id]; + slot_states = ring.slot_states; // Initialize all per-task slot 
state fields. // bind() sets payload, task, ring_id — immutable after init, bound once // to their fixed shared-memory addresses. // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, // rest zero) so the first submit needs no reset. - PTO2TaskPayload *payloads = sm_handle->task_payloads[ring_id]; + PTO2TaskPayload *payloads = ring.task_payloads; for (uint64_t i = 0; i < task_window_size; i++) { slot_states[i].bind(&payloads[i], &task_descriptors[i], static_cast(ring_id)); slot_states[i].reset_for_reuse(); @@ -145,8 +146,8 @@ bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHandle *sm_handle, void PTO2SchedulerState::RingSchedState::destroy() { slot_states = nullptr; } -bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, int32_t dep_pool_capacity) { - sched->sm_handle = sm_handle; +bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHeader *sm_header, int32_t dep_pool_capacity) { + sched->sm_header = sm_header; #if PTO2_SCHED_PROFILING sched->tasks_completed.store(0, std::memory_order_relaxed); sched->tasks_consumed.store(0, std::memory_order_relaxed); @@ -154,7 +155,7 @@ bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_h // Initialize per-ring state for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_handle, r)) { + if (!sched->ring_sched_states[r].init(sm_header, r)) { for (int j = 0; j < r; j++) { sched->ring_sched_states[j].destroy(); } @@ -193,7 +194,7 @@ bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_h } return false; } - sched->ring_sched_states[r].dep_pool.init(dep_entries, dep_pool_capacity, &sm_handle->header->orch_error_code); + sched->ring_sched_states[r].dep_pool.init(dep_entries, dep_pool_capacity, &sm_header->orch_error_code); } // Initialize global wiring queue (SPSC: orchestrator pushes, scheduler thread 0 drains) diff --git 
a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index edd2c8afe..dc4922c6b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -520,7 +520,7 @@ struct PTO2CompletionStats { */ struct PTO2SchedulerState { // Shared memory access - PTO2SharedMemoryHandle *sm_handle; + PTO2SharedMemoryHeader *sm_header; // Per-ring state struct alignas(64) RingSchedState { @@ -537,7 +537,7 @@ struct PTO2SchedulerState { // --- Cache Line 2+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; - bool init(PTO2SharedMemoryHandle *sm_handle, int32_t ring_id); + bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); void destroy(); PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { @@ -651,7 +651,7 @@ struct PTO2SchedulerState { int32_t wfanin = ws->payload->fanin_actual_count; if (wfanin > 0 && rss.dep_pool.available() < wfanin) { - rss.dep_pool.reclaim(*sm_handle, ring_id, rss.last_task_alive); + rss.dep_pool.reclaim(sm_header->rings[ring_id], rss.last_task_alive); if (wfanin > 0 && rss.dep_pool.available() < wfanin) { break; // not enough dep_pool space — keep remainder for next call } @@ -728,7 +728,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); } } @@ -762,7 +762,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - 
ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); atomic_count += 2; // try-lock CAS + unlock store } else { @@ -1010,7 +1010,7 @@ struct PTO2SchedulerState { // ============================================================================= bool pto2_scheduler_init( - PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE + PTO2SchedulerState *sched, PTO2SharedMemoryHeader *sm_header, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE ); void pto2_scheduler_destroy(PTO2SchedulerState *sched); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index dfabca42f..922e0785b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -63,18 +63,16 @@ pto2_sm_setup_pointers_per_ring(PTO2SharedMemoryHandle *handle, const uint64_t t handle->header = (PTO2SharedMemoryHeader *)ptr; ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - // Per-ring task descriptors and payloads + // Per-ring task descriptors, payloads, and slot states for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - handle->task_window_sizes[r] = task_window_sizes[r]; - handle->task_window_masks[r] = static_cast(task_window_sizes[r] - 1); - - handle->task_descriptors[r] = (PTO2TaskDescriptor *)ptr; + auto &ring = handle->header->rings[r]; + ring.task_descriptors = (PTO2TaskDescriptor *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - handle->task_payloads[r] = (PTO2TaskPayload *)ptr; + ring.task_payloads = (PTO2TaskPayload *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), 
PTO2_ALIGN_SIZE); - handle->slot_states[r] = (PTO2TaskSlotState *)ptr; + ring.slot_states = (PTO2TaskSlotState *)ptr; ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); } } @@ -190,6 +188,7 @@ void pto2_sm_init_header_per_ring( uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { header->rings[r].task_window_size = task_window_sizes[r]; + header->rings[r].task_window_mask = static_cast(task_window_sizes[r] - 1); header->rings[r].heap_size = heap_sizes[r]; header->rings[r].task_descriptors_offset = offset; offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); @@ -266,7 +265,7 @@ bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_ if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false; // Check pointer alignment - if ((uintptr_t)handle->task_descriptors[ring_id] % PTO2_ALIGN_SIZE != 0) return false; + if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false; // Check flow control pointer sanity int32_t current = current_task_index.load(std::memory_order_acquire); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index de7d0b71b..d89fe698f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -75,13 +75,26 @@ static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be e /** * Per-ring shared memory header section. * - * Groups flow-control and layout info for a single ring to avoid parallel arrays. + * Groups flow-control, layout info, and per-ring data pointers for a single ring. + * Pointers are host-side only (set by pto2_sm_setup_pointers, invalid on device). 
*/ -struct PTO2SharedMemoryRingHeader { +struct alignas(64) PTO2SharedMemoryRingHeader { PTO2RingFlowControl fc; + + // Layout metadata (set once at init) uint64_t task_window_size; + int32_t task_window_mask; uint64_t heap_size; uint64_t task_descriptors_offset; // Offset from SM base, in bytes + + // Per-ring data pointers (host-side, set by pto2_sm_setup_pointers) + PTO2TaskDescriptor *task_descriptors; + PTO2TaskPayload *task_payloads; + PTO2TaskSlotState *slot_states; + + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } + + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { return slot_states[local_id & task_window_mask]; } }; /** @@ -118,8 +131,8 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { }; static_assert( - sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0, - "PTO2SharedMemoryHeader must be aligned to cache line (PTO2_ALIGN_SIZE)" + sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0 || sizeof(PTO2SharedMemoryHeader) < 4096, + "PTO2SharedMemoryHeader should be reasonably sized" ); // ============================================================================= @@ -127,31 +140,17 @@ static_assert( // ============================================================================= /** - * Handle for shared memory access - * Provides both Orchestrator and Scheduler views of the same memory + * Handle for shared memory lifecycle management (create/destroy). + * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly. 
*/ struct PTO2SharedMemoryHandle { void *sm_base; // Base address of shared memory uint64_t sm_size; // Total size of shared memory - // Quick pointers into shared memory regions (per-ring) PTO2SharedMemoryHeader *header; - PTO2TaskDescriptor *task_descriptors[PTO2_MAX_RING_DEPTH]; - PTO2TaskPayload *task_payloads[PTO2_MAX_RING_DEPTH]; - PTO2TaskSlotState *slot_states[PTO2_MAX_RING_DEPTH]; - - // Cached per-ring layout (avoids indirection through header) - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - int32_t task_window_masks[PTO2_MAX_RING_DEPTH]; // Ownership flag bool is_owner; // True if this handle allocated the memory - - PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { return slot_states[ring_id][slot]; } - - PTO2TaskSlotState &get_slot_state_by_task_id(int32_t ring_id, int32_t local_id) { - return slot_states[ring_id][local_id & task_window_masks[ring_id]]; - } }; // ============================================================================= From 0e3c3db3d5c5412dce6691b60b5b8525516ce213 Mon Sep 17 00:00:00 2001 From: poursoul Date: Tue, 21 Apr 2026 12:09:01 +0800 Subject: [PATCH 3/4] Refactor: replace RingSchedState cached members with ring pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Store PTO2SharedMemoryRingHeader* in RingSchedState instead of duplicating task_descriptors, slot_states, task_window_mask, task_window_size as separate members - Merge ring pointer with last_task_alive/advance_lock into cache line 0 (read-only pointer shares line with hot-path atomics) - Remove get_slot_state_by_task_id/get_slot_state_by_slot wrappers from RingSchedState — callers use ring->get_slot_state_by_*() - sync_to_sm/advance_ring_pointers no longer take ring parameter - Executor uses sched->ring_sched_states[r].ring directly instead of casting sm_base to header - Remove unused TaskAllocator::task()/task_by_slot() accessors - Runtime wait_for_tensor_ready accesses slot states via 
ring header --- .../aicpu/aicpu_executor.cpp | 19 ++++---- .../runtime/pto_ring_buffer.h | 8 ---- .../runtime/pto_runtime2.cpp | 7 +-- .../runtime/pto_scheduler.cpp | 23 ++++------ .../runtime/pto_scheduler.h | 46 ++++++------------- .../aicpu/aicpu_executor.cpp | 19 ++++---- .../runtime/pto_ring_buffer.h | 8 ---- .../runtime/pto_runtime2.cpp | 7 +-- .../runtime/pto_scheduler.cpp | 23 ++++------ .../runtime/pto_scheduler.h | 46 ++++++------------- 10 files changed, 66 insertions(+), 140 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index fa42b84d2..fd63f0351 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -558,7 +558,7 @@ struct AicpuExecutor { } __attribute__((noinline, cold)) void log_stall_diagnostics( - int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count, void *sm_base + int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count ) { int32_t c = completed_tasks_.load(std::memory_order_relaxed); DEV_ALWAYS( @@ -567,12 +567,12 @@ struct AicpuExecutor { ); CoreTracker &tracker = core_trackers_[thread_idx]; PTO2SchedulerState *sched = &rt->scheduler; - PTO2SharedMemoryHeader *sm_header_diag = static_cast(sm_base); int32_t cnt_ready = 0, cnt_waiting = 0, cnt_inflight = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t ring_task_count = sm_header_diag->rings[r].fc.current_task_index.load(std::memory_order_relaxed); + PTO2SharedMemoryRingHeader &ring = *sched->ring_sched_states[r].ring; + int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed); for (int32_t si = 0; si < ring_task_count; si++) { - PTO2TaskSlotState &slot_state = sched->get_slot_state(r, si); + PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si); 
PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed); int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed); int32_t fi = slot_state.fanin_count; @@ -1892,14 +1892,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CoreTracker &tracker = core_trackers_[thread_idx]; DEV_INFO("Thread %d: resolve_and_dispatch_pto2 entry", thread_idx); - void *sm_base = runtime->get_pto2_gm_sm_ptr(); - if (!sm_base) { - DEV_ERROR("PTO2 dispatch: sm_base is null"); + PTO2SharedMemoryHeader *header = rt->scheduler.sm_header; + if (!header) { + DEV_ERROR("PTO2 dispatch: header is null"); return -1; } - DEV_INFO("Thread %d: sm_base=%p", thread_idx, sm_base); - - PTO2SharedMemoryHeader *header = static_cast(sm_base); DEV_INFO( "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast(header), static_cast(header->rings[0].task_descriptors_offset), @@ -2140,7 +2137,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } if (thread_idx == 0 && task_count > 0 && idle_iterations % STALL_LOG_INTERVAL == 0) { - log_stall_diagnostics(thread_idx, task_count, idle_iterations, last_progress_count, sm_base); + log_stall_diagnostics(thread_idx, task_count, idle_iterations, last_progress_count); } if (idle_iterations > MAX_IDLE_ITERATIONS) { return handle_timeout_exit( diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 05eeb32a9..684551af9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -164,14 +164,6 @@ class PTO2TaskAllocator { } } - // ========================================================================= - // Task descriptor accessors - // ========================================================================= - - PTO2TaskDescriptor &task(int32_t 
task_id) const { return descriptors_[task_id & window_mask_]; } - - PTO2TaskDescriptor &task_by_slot(int32_t slot) const { return descriptors_[slot]; } - // ========================================================================= // State queries // ========================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 6ff0bdef3..953a28cc3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -76,7 +76,9 @@ void pto2_rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, // Returns false on timeout (sets orch.fatal). MAYBE_UNINITIALIZED_BEGIN static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) { + PTO2TaskId owner = tensor.owner_task_id; PTO2OrchestratorState &orch = rt->orchestrator; + auto &ring = orch.sm_header->rings[owner.ring()]; // Collect producer slot states from both maps, deduplicated by pointer. // +1: one creator slot + up to PTO2_LOOKUP_MAX_RESULTS modifier slots. 
@@ -85,9 +87,8 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa int slot_count = 0; // Step A: creator retention — read owner directly from tensor metadata - PTO2TaskId owner = tensor.owner_task_id; if (owner.is_valid()) { - slots[slot_count++] = &rt->scheduler.ring_sched_states[owner.ring()].get_slot_state_by_task_id(owner.local()); + slots[slot_count++] = &ring.get_slot_state_by_task_id(owner.local()); } // Step B: modifier writer lookup (OverlapMap) @@ -95,7 +96,7 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa orch.tensor_map.lookup(tensor, lookup_result); for (int r = 0; r < lookup_result.count; r++) { PTO2TaskId pid = lookup_result.entries[r].entry->producer_task_id; - PTO2TaskSlotState *s = &rt->scheduler.ring_sched_states[pid.ring()].get_slot_state_by_task_id(pid.local()); + PTO2TaskSlotState *s = &ring.get_slot_state_by_task_id(pid.local()); bool already = false; for (int j = 0; j < slot_count; j++) { if (slots[j] == s) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index 5e304b334..e4b57550b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -117,34 +117,27 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue *queue) { // ============================================================================= bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { - auto &ring = sm_header->rings[ring_id]; - task_descriptors = ring.task_descriptors; - task_window_size = ring.task_window_size; - task_window_mask = ring.task_window_mask; + ring = &sm_header->rings[ring_id]; last_task_alive = 0; advance_lock.store(0, std::memory_order_relaxed); - // Point into shared memory (allocated by pto2_sm_create) - slot_states = ring.slot_states; - // Initialize 
all per-task slot state fields. // bind() sets payload, task, ring_id — immutable after init, bound once // to their fixed shared-memory addresses. // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, // rest zero) so the first submit needs no reset. - PTO2TaskPayload *payloads = ring.task_payloads; - for (uint64_t i = 0; i < task_window_size; i++) { - slot_states[i].bind(&payloads[i], &task_descriptors[i], static_cast(ring_id)); - slot_states[i].reset_for_reuse(); - slot_states[i].fanin_count = 0; - slot_states[i].active_mask = 0; - slot_states[i].subtask_done_mask.store(0, std::memory_order_relaxed); + for (uint64_t i = 0; i < ring->task_window_size; i++) { + ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast(ring_id)); + ring->slot_states[i].reset_for_reuse(); + ring->slot_states[i].fanin_count = 0; + ring->slot_states[i].active_mask = 0; + ring->slot_states[i].subtask_done_mask.store(0, std::memory_order_relaxed); } return true; } -void PTO2SchedulerState::RingSchedState::destroy() { slot_states = nullptr; } +void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHeader *sm_header, int32_t dep_pool_capacity) { sched->sm_header = sm_header; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index 96ec42621..5f4bfba67 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -521,37 +521,25 @@ struct PTO2SchedulerState { // Per-ring state struct alignas(64) RingSchedState { - // --- Cache Line 0: Read-only after init (pointers + config) --- - PTO2TaskDescriptor *task_descriptors; - PTO2TaskSlotState *slot_states; - int32_t task_window_mask; - uint64_t task_window_size; - - // --- Cache Line 1: Multi-thread hot path (advance) --- 
- alignas(64) int32_t last_task_alive; + // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) --- + PTO2SharedMemoryRingHeader *ring; + int32_t last_task_alive; std::atomic advance_lock; // multi-thread CAS - // --- Cache Line 2+: Thread 0 only (wiring dep_pool) --- + // --- Cache Line 1+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); void destroy(); - PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { - return slot_states[local_id & task_window_mask]; - } - PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } - - void sync_to_sm(PTO2SharedMemoryRingHeader &ring) { - ring.fc.last_task_alive.store(last_task_alive, std::memory_order_release); - } + void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } - void advance_ring_pointers(PTO2SharedMemoryRingHeader &ring) { - int32_t current_task_index = ring.fc.current_task_index.load(std::memory_order_acquire); + void advance_ring_pointers() { + int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire); int32_t old_last_task_alive = last_task_alive; while (last_task_alive < current_task_index) { - PTO2TaskSlotState &slot_state = get_slot_state_by_task_id(last_task_alive); + PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive); if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) { break; } @@ -564,10 +552,10 @@ struct PTO2SchedulerState { // them until the release store below. // Skips payload, task, ring_id — immutable after RingSchedState::init(). 
for (int32_t id = old_last_task_alive; id < last_task_alive; id++) { - get_slot_state_by_task_id(id).reset_for_reuse(); + ring->get_slot_state_by_task_id(id).reset_for_reuse(); } - sync_to_sm(ring); + sync_to_sm(); } } ring_sched_states[PTO2_MAX_RING_DEPTH]; @@ -648,7 +636,7 @@ struct PTO2SchedulerState { int32_t wfanin = ws->payload->fanin_actual_count; if (wfanin > 0 && rss.dep_pool.available() < wfanin) { - rss.dep_pool.reclaim(sm_header->rings[ring_id], rss.last_task_alive); + rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive); if (wfanin > 0 && rss.dep_pool.available() < wfanin) { break; // not enough dep_pool space — keep remainder for next call } @@ -697,14 +685,6 @@ struct PTO2SchedulerState { ws->dep_pool_mark = rss.dep_pool.top; } - PTO2TaskSlotState &get_slot_state(int32_t ring_id, int32_t local_id) { - return ring_sched_states[ring_id].get_slot_state_by_task_id(local_id); - } - - PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { - return ring_sched_states[ring_id].get_slot_state_by_slot(slot); - } - void check_and_handle_consumed(PTO2TaskSlotState &slot_state) { if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return; @@ -725,7 +705,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(); ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); } } @@ -759,7 +739,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(); ring_sched_states[ring_id].advance_lock.store(0, 
std::memory_order_release); atomic_count += 2; // try-lock CAS + unlock store } else { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 9f0b81592..663cfca74 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -557,7 +557,7 @@ struct AicpuExecutor { } __attribute__((noinline, cold)) void log_stall_diagnostics( - int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count, void *sm_base + int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count ) { int32_t c = completed_tasks_.load(std::memory_order_relaxed); DEV_ALWAYS( @@ -566,12 +566,12 @@ struct AicpuExecutor { ); CoreTracker &tracker = core_trackers_[thread_idx]; PTO2SchedulerState *sched = &rt->scheduler; - PTO2SharedMemoryHeader *sm_header_diag = static_cast(sm_base); int32_t cnt_ready = 0, cnt_waiting = 0, cnt_inflight = 0; for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t ring_task_count = sm_header_diag->rings[r].fc.current_task_index.load(std::memory_order_relaxed); + PTO2SharedMemoryRingHeader &ring = *sched->ring_sched_states[r].ring; + int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed); for (int32_t si = 0; si < ring_task_count; si++) { - PTO2TaskSlotState &slot_state = sched->get_slot_state(r, si); + PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si); PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed); int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed); int32_t fi = slot_state.fanin_count; @@ -1881,14 +1881,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CoreTracker &tracker = core_trackers_[thread_idx]; DEV_INFO("Thread %d: resolve_and_dispatch_pto2 entry", thread_idx); - void *sm_base = 
runtime->get_pto2_gm_sm_ptr(); - if (!sm_base) { - DEV_ERROR("PTO2 dispatch: sm_base is null"); + PTO2SharedMemoryHeader *header = rt->scheduler.sm_header; + if (!header) { + DEV_ERROR("PTO2 dispatch: header is null"); return -1; } - DEV_INFO("Thread %d: sm_base=%p", thread_idx, sm_base); - - PTO2SharedMemoryHeader *header = static_cast(sm_base); DEV_INFO( "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast(header), static_cast(header->rings[0].task_descriptors_offset), @@ -2129,7 +2126,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } if (thread_idx == 0 && task_count > 0 && idle_iterations % STALL_LOG_INTERVAL == 0) { - log_stall_diagnostics(thread_idx, task_count, idle_iterations, last_progress_count, sm_base); + log_stall_diagnostics(thread_idx, task_count, idle_iterations, last_progress_count); } if (idle_iterations > MAX_IDLE_ITERATIONS) { return handle_timeout_exit( diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 05eeb32a9..684551af9 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -164,14 +164,6 @@ class PTO2TaskAllocator { } } - // ========================================================================= - // Task descriptor accessors - // ========================================================================= - - PTO2TaskDescriptor &task(int32_t task_id) const { return descriptors_[task_id & window_mask_]; } - - PTO2TaskDescriptor &task_by_slot(int32_t slot) const { return descriptors_[slot]; } - // ========================================================================= // State queries // ========================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp 
b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 6ff0bdef3..953a28cc3 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -76,7 +76,9 @@ void pto2_rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, // Returns false on timeout (sets orch.fatal). MAYBE_UNINITIALIZED_BEGIN static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) { + PTO2TaskId owner = tensor.owner_task_id; PTO2OrchestratorState &orch = rt->orchestrator; + auto &ring = orch.sm_header->rings[owner.ring()]; // Collect producer slot states from both maps, deduplicated by pointer. // +1: one creator slot + up to PTO2_LOOKUP_MAX_RESULTS modifier slots. @@ -85,9 +87,8 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa int slot_count = 0; // Step A: creator retention — read owner directly from tensor metadata - PTO2TaskId owner = tensor.owner_task_id; if (owner.is_valid()) { - slots[slot_count++] = &rt->scheduler.ring_sched_states[owner.ring()].get_slot_state_by_task_id(owner.local()); + slots[slot_count++] = &ring.get_slot_state_by_task_id(owner.local()); } // Step B: modifier writer lookup (OverlapMap) @@ -95,7 +96,7 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa orch.tensor_map.lookup(tensor, lookup_result); for (int r = 0; r < lookup_result.count; r++) { PTO2TaskId pid = lookup_result.entries[r].entry->producer_task_id; - PTO2TaskSlotState *s = &rt->scheduler.ring_sched_states[pid.ring()].get_slot_state_by_task_id(pid.local()); + PTO2TaskSlotState *s = &ring.get_slot_state_by_task_id(pid.local()); bool already = false; for (int j = 0; j < slot_count; j++) { if (slots[j] == s) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index 
5e304b334..e4b57550b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -117,34 +117,27 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue *queue) { // ============================================================================= bool PTO2SchedulerState::RingSchedState::init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id) { - auto &ring = sm_header->rings[ring_id]; - task_descriptors = ring.task_descriptors; - task_window_size = ring.task_window_size; - task_window_mask = ring.task_window_mask; + ring = &sm_header->rings[ring_id]; last_task_alive = 0; advance_lock.store(0, std::memory_order_relaxed); - // Point into shared memory (allocated by pto2_sm_create) - slot_states = ring.slot_states; - // Initialize all per-task slot state fields. // bind() sets payload, task, ring_id — immutable after init, bound once // to their fixed shared-memory addresses. // reset_for_reuse() sets dynamic fields to reclaim defaults (fanout_count=1, // rest zero) so the first submit needs no reset. 
- PTO2TaskPayload *payloads = ring.task_payloads; - for (uint64_t i = 0; i < task_window_size; i++) { - slot_states[i].bind(&payloads[i], &task_descriptors[i], static_cast(ring_id)); - slot_states[i].reset_for_reuse(); - slot_states[i].fanin_count = 0; - slot_states[i].active_mask = 0; - slot_states[i].subtask_done_mask.store(0, std::memory_order_relaxed); + for (uint64_t i = 0; i < ring->task_window_size; i++) { + ring->slot_states[i].bind(&ring->task_payloads[i], &ring->task_descriptors[i], static_cast(ring_id)); + ring->slot_states[i].reset_for_reuse(); + ring->slot_states[i].fanin_count = 0; + ring->slot_states[i].active_mask = 0; + ring->slot_states[i].subtask_done_mask.store(0, std::memory_order_relaxed); } return true; } -void PTO2SchedulerState::RingSchedState::destroy() { slot_states = nullptr; } +void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } bool pto2_scheduler_init(PTO2SchedulerState *sched, PTO2SharedMemoryHeader *sm_header, int32_t dep_pool_capacity) { sched->sm_header = sm_header; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index dc4922c6b..3859a89ba 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -524,37 +524,25 @@ struct PTO2SchedulerState { // Per-ring state struct alignas(64) RingSchedState { - // --- Cache Line 0: Read-only after init (pointers + config) --- - PTO2TaskDescriptor *task_descriptors; - PTO2TaskSlotState *slot_states; - int32_t task_window_mask; - uint64_t task_window_size; - - // --- Cache Line 1: Multi-thread hot path (advance) --- - alignas(64) int32_t last_task_alive; + // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) --- + PTO2SharedMemoryRingHeader *ring; + int32_t last_task_alive; std::atomic advance_lock; // multi-thread CAS - // --- Cache Line 2+: Thread 0 only (wiring dep_pool) --- 
+ // --- Cache Line 1+: Thread 0 only (wiring dep_pool) --- alignas(64) PTO2DepListPool dep_pool; bool init(PTO2SharedMemoryHeader *sm_header, int32_t ring_id); void destroy(); - PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { - return slot_states[local_id & task_window_mask]; - } - PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } - - void sync_to_sm(PTO2SharedMemoryRingHeader &ring) { - ring.fc.last_task_alive.store(last_task_alive, std::memory_order_release); - } + void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } - void advance_ring_pointers(PTO2SharedMemoryRingHeader &ring) { - int32_t current_task_index = ring.fc.current_task_index.load(std::memory_order_acquire); + void advance_ring_pointers() { + int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire); int32_t old_last_task_alive = last_task_alive; while (last_task_alive < current_task_index) { - PTO2TaskSlotState &slot_state = get_slot_state_by_task_id(last_task_alive); + PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive); if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) { break; } @@ -567,10 +555,10 @@ struct PTO2SchedulerState { // them until the release store below. // Skips payload, task, ring_id — immutable after RingSchedState::init(). 
for (int32_t id = old_last_task_alive; id < last_task_alive; id++) { - get_slot_state_by_task_id(id).reset_for_reuse(); + ring->get_slot_state_by_task_id(id).reset_for_reuse(); } - sync_to_sm(ring); + sync_to_sm(); } } ring_sched_states[PTO2_MAX_RING_DEPTH]; @@ -651,7 +639,7 @@ struct PTO2SchedulerState { int32_t wfanin = ws->payload->fanin_actual_count; if (wfanin > 0 && rss.dep_pool.available() < wfanin) { - rss.dep_pool.reclaim(sm_header->rings[ring_id], rss.last_task_alive); + rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive); if (wfanin > 0 && rss.dep_pool.available() < wfanin) { break; // not enough dep_pool space — keep remainder for next call } @@ -700,14 +688,6 @@ struct PTO2SchedulerState { ws->dep_pool_mark = rss.dep_pool.top; } - PTO2TaskSlotState &get_slot_state(int32_t ring_id, int32_t local_id) { - return ring_sched_states[ring_id].get_slot_state_by_task_id(local_id); - } - - PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { - return ring_sched_states[ring_id].get_slot_state_by_slot(slot); - } - void check_and_handle_consumed(PTO2TaskSlotState &slot_state) { if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return; @@ -728,7 +708,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(); ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); } } @@ -762,7 +742,7 @@ struct PTO2SchedulerState { if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_header->rings[ring_id]); + ring_sched_states[ring_id].advance_ring_pointers(); ring_sched_states[ring_id].advance_lock.store(0, 
std::memory_order_release); atomic_count += 2; // try-lock CAS + unlock store } else { From d1fc66db7029615d8aa2298dec0b5443917b5a70 Mon Sep 17 00:00:00 2001 From: poursoul Date: Tue, 21 Apr 2026 15:24:31 +0800 Subject: [PATCH 4/4] Refactor: clean up fanin builder and add RingHeader accessors - PTO2FaninBuilder: use reference member + constructor instead of pointer + manual init; simplify pto2_append_fanin_or_fail signature - pto2_for_each_fanin_storage: take PTO2FaninPool& instead of pointer - Move fanin metadata write and fanout_count increment before payload.init() in submit path (STEP 5) - FaninPool::reclaim: access payload directly via get_payload_by_task_id instead of indirecting through slot_state->payload - Add get_task_by_*/get_payload_by_* accessors to RingHeader - Applied to a2a3 and a5 tensormap_and_ringbuffer variants --- .../runtime/pto_orchestrator.cpp | 82 +++++++--------- .../runtime/pto_ring_buffer.cpp | 11 +-- .../runtime/pto_ring_buffer.h | 22 ++--- .../runtime/pto_runtime2.cpp | 5 +- .../runtime/pto_shared_memory.h | 10 +- .../runtime/pto_orchestrator.cpp | 93 +++++++------------ .../runtime/pto_ring_buffer.cpp | 11 +-- .../runtime/pto_ring_buffer.h | 22 ++--- .../runtime/pto_runtime2.cpp | 5 +- .../runtime/pto_shared_memory.h | 10 +- 10 files changed, 119 insertions(+), 152 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index fb99efaa5..4bebbaf7a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -170,10 +170,14 @@ void pto2_orch_report_fatal(PTO2OrchestratorState *orch, int32_t error_code, con } struct PTO2FaninBuilder { - PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP]; + PTO2FaninBuilder(PTO2FaninPool &spill_pool) : + count(0), + spill_start(0), + spill_pool(spill_pool) {} int32_t count{0}; int32_t 
spill_start{0}; - PTO2FaninPool *spill_pool{nullptr}; + PTO2FaninPool &spill_pool; + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP]; template PTO2FaninForEachReturn for_each(Fn &&fn) const { @@ -197,8 +201,7 @@ struct PTO2FaninBuilder { }; static bool pto2_append_fanin_or_fail( - PTO2OrchestratorState *orch, PTO2TaskId task_id, int32_t tensor_arg_index, TensorArgType ptype, - PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, uint8_t ring_id, const char *reason + PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, uint8_t ring_id ) { if (fanin_builder->contains(prod_state)) { return true; @@ -209,21 +212,7 @@ static bool pto2_append_fanin_or_fail( return true; } - if (fanin_builder->spill_pool == nullptr) { - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Fanin Spill Builder Misconfigured!"); - LOG_ERROR("========================================"); - LOG_ERROR("Missing fanin spill pool while appending dynamic fanin."); - LOG_ERROR(" task_id.raw: %" PRIu64, task_id.raw); - LOG_ERROR(" tensor_arg_index: %d", tensor_arg_index); - LOG_ERROR(" tensor_arg_type: %d", static_cast(ptype)); - LOG_ERROR(" reason: %s", reason); - LOG_ERROR("========================================"); - pto2_orch_mark_fatal(orch, PTO2_ERROR_DEPENDENCY_OVERFLOW); - return false; - } - - PTO2FaninPool &fanin_pool = *fanin_builder->spill_pool; + PTO2FaninPool &fanin_pool = fanin_builder->spill_pool; fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1); int32_t spill_idx = fanin_pool.top; PTO2FaninSpillEntry *entry = fanin_pool.alloc(); @@ -578,10 +567,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke PTO2TaskPayload &payload = *prepared.payload; result.set_task_id(task_id); - PTO2FaninBuilder fanin_builder; - fanin_builder.count = 0; - fanin_builder.spill_start = 0; - fanin_builder.spill_pool = &orch->rings[ring_id].fanin_pool; + PTO2FaninBuilder 
fanin_builder(orch->rings[ring_id].fanin_pool); CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id.raw); @@ -615,9 +601,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke if (owner.is_valid()) { PTO2TaskSlotState *prod_state = &orch->sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); - if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "creator retention" - )) { + if (!pto2_append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id)) { return result; } } @@ -639,9 +623,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke auto prod_ring = entry.producer_task_id.ring(); auto prod_local = entry.producer_task_id.local(); PTO2TaskSlotState *prod_state = &orch->sm_header->rings[prod_ring].get_slot_state_by_task_id(prod_local); - if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "overlap lookup" - )) { + if (!pto2_append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id)) { return result; } if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) { @@ -666,7 +648,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, task_id.raw); - // === STEP 5: Batch-write to GM (single cache line burst) === + // === STEP 5: Batch-write to GM (single cache line burst) + Record fanin metadata === // Deferred from allocation phase to avoid scattered GM writes that get // evicted by TensorMap lookup/insert cache pressure. 
__builtin_prefetch(&task, 1, 1); @@ -677,6 +659,24 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke task.packed_buffer_base = prepared.alloc_result.packed_base; task.packed_buffer_end = prepared.alloc_result.packed_end; + // Increment fanout_count on each producer (no lock — only orch writes this field). + // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count. + pto2_for_each_fanin_storage( + fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool, + [](PTO2TaskSlotState *producer) { + producer->fanout_count++; + } + ); + + int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP); + // Store fanin metadata in payload for scheduler to iterate + payload.fanin_actual_count = fanin_builder.count; + payload.fanin_spill_start = fanin_builder.spill_start; + payload.fanin_spill_pool = &fanin_builder.spill_pool; + for (int i = 0; i < inline_count; i++) { + payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; + } + payload.init(args, result, prepared.alloc_result, layout); CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw); @@ -684,28 +684,10 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke g_orch_args_atomic_count += 2; // fanout_lock.store + fanout_count.store #endif - // === STEP 6: Record fanin metadata + push to wiring queue === + // === STEP 6: push to wiring queue === // Deferred wiring: orchestrator only stores dependency metadata and increments // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished) // is handled asynchronously by scheduler thread 0 via the wiring queue. 
- int32_t fanin_count = fanin_builder.count; - int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); - int32_t spill_count = fanin_count - inline_count; - - // Store fanin metadata in payload for scheduler to iterate - payload.fanin_actual_count = fanin_count; - payload.fanin_spill_start = (spill_count > 0) ? fanin_builder.spill_start : 0; - payload.fanin_spill_pool = (spill_count > 0) ? fanin_builder.spill_pool : nullptr; - for (int i = 0; i < inline_count; i++) { - payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; - } - - // Increment fanout_count on each producer (no lock — only orch writes this field). - // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count. - pto2_for_each_fanin_slot_state(payload, [](PTO2TaskSlotState *producer) { - producer->fanout_count += 1; - }); - // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness while (!sched->wiring.queue.push(&cur_slot_state)) { SPIN_WAIT_HINT(); @@ -791,7 +773,7 @@ TaskOutputTensors pto2_alloc_tensors(PTO2OrchestratorState *orch, const Arg &arg payload.init(args, outputs, prepared.alloc_result, layout); payload.fanin_actual_count = 0; payload.fanin_spill_start = 0; - payload.fanin_spill_pool = nullptr; + payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool; CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, prepared.task_id.raw); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index 94493bddb..139d42c93 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -32,16 +32,15 @@ void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_ta int32_t scan_end = sm_last_task_alive; for (int32_t task_id = reclaim_task_cursor; task_id < 
scan_end; ++task_id) { - PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(task_id); - PTO2TaskPayload *payload = slot_state.payload; - if (payload == nullptr || payload->fanin_spill_pool != this) { + PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id); + if (payload.fanin_spill_pool != this) { continue; } - int32_t inline_count = std::min(payload->fanin_actual_count, PTO2_FANIN_INLINE_CAP); - int32_t spill_edge_count = payload->fanin_actual_count - inline_count; + int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP); + int32_t spill_edge_count = payload.fanin_actual_count - inline_count; if (spill_edge_count > 0) { - advance_tail(payload->fanin_spill_start + spill_edge_count); + advance_tail(payload.fanin_spill_start + spill_edge_count); } } reclaim_task_cursor = scan_end; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 684551af9..445b6c73a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -440,7 +440,7 @@ using PTO2FaninForEachReturn = std::conditional_t inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( - InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool *spill_pool, Fn &&fn + InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn ) { using FaninCallbackResult = PTO2FaninCallbackResult; static_assert( @@ -459,17 +459,16 @@ inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( return; } - PTO2FaninPool *pool = spill_pool; - int32_t start_idx = spill_start % pool->capacity; - int32_t first_count = std::min(spill_count, pool->capacity - start_idx); - PTO2FaninSpillEntry *first = pool->base + start_idx; + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = 
std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; for (int32_t i = 0; i < first_count; i++) { fn(first[i].slot_state); } int32_t second_count = spill_count - first_count; for (int32_t i = 0; i < second_count; i++) { - fn(pool->base[i].slot_state); + fn(spill_pool.base[i].slot_state); } return; } else { @@ -485,10 +484,9 @@ inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( return true; } - PTO2FaninPool *pool = spill_pool; - int32_t start_idx = spill_start % pool->capacity; - int32_t first_count = std::min(spill_count, pool->capacity - start_idx); - PTO2FaninSpillEntry *first = pool->base + start_idx; + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; for (int32_t i = 0; i < first_count; i++) { if (!fn(first[i].slot_state)) { return false; @@ -497,7 +495,7 @@ inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( int32_t second_count = spill_count - first_count; for (int32_t i = 0; i < second_count; i++) { - if (!fn(pool->base[i].slot_state)) { + if (!fn(spill_pool.base[i].slot_state)) { return false; } } @@ -509,7 +507,7 @@ template inline PTO2FaninForEachReturn pto2_for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) { return pto2_for_each_fanin_storage( payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start, - payload.fanin_spill_pool, static_cast(fn) + *payload.fanin_spill_pool, static_cast(fn) ); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 953a28cc3..08cd7fabc 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -78,7 +78,6 @@ MAYBE_UNINITIALIZED_BEGIN static bool 
wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) { PTO2TaskId owner = tensor.owner_task_id; PTO2OrchestratorState &orch = rt->orchestrator; - auto &ring = orch.sm_header->rings[owner.ring()]; // Collect producer slot states from both maps, deduplicated by pointer. // +1: one creator slot + up to PTO2_LOOKUP_MAX_RESULTS modifier slots. @@ -88,7 +87,7 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa // Step A: creator retention — read owner directly from tensor metadata if (owner.is_valid()) { - slots[slot_count++] = &ring.get_slot_state_by_task_id(owner.local()); + slots[slot_count++] = &orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); } // Step B: modifier writer lookup (OverlapMap) @@ -96,7 +95,7 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa orch.tensor_map.lookup(tensor, lookup_result); for (int r = 0; r < lookup_result.count; r++) { PTO2TaskId pid = lookup_result.entries[r].entry->producer_task_id; - PTO2TaskSlotState *s = &ring.get_slot_state_by_task_id(pid.local()); + PTO2TaskSlotState *s = &orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local()); bool already = false; for (int j = 0; j < slot_count; j++) { if (slots[j] == s) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index af538822f..3e1e89856 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -92,6 +92,14 @@ struct alignas(64) PTO2SharedMemoryRingHeader { PTO2TaskPayload *task_payloads; PTO2TaskSlotState *slot_states; + PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; } + + PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) { return task_descriptors[local_id & 
task_window_mask]; } + + PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; } + + PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[local_id & task_window_mask]; } + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { return slot_states[local_id & task_window_mask]; } @@ -131,7 +139,7 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { }; static_assert( - sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0 || sizeof(PTO2SharedMemoryHeader) < 4096, + (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096), "PTO2SharedMemoryHeader should be reasonably sized" ); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index eba7bd982..f13ab68c6 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -170,10 +170,14 @@ void pto2_orch_report_fatal(PTO2OrchestratorState *orch, int32_t error_code, con } struct PTO2FaninBuilder { - PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP]; + PTO2FaninBuilder(PTO2FaninPool &spill_pool) : + count(0), + spill_start(0), + spill_pool(spill_pool) {} int32_t count{0}; int32_t spill_start{0}; - PTO2FaninPool *spill_pool{nullptr}; + PTO2FaninPool &spill_pool; + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP]; template PTO2FaninForEachReturn for_each(Fn &&fn) const { @@ -197,8 +201,7 @@ struct PTO2FaninBuilder { }; static bool pto2_append_fanin_or_fail( - PTO2OrchestratorState *orch, PTO2TaskId task_id, int32_t tensor_arg_index, TensorArgType ptype, - PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, uint8_t ring_id, const char *reason + PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, 
PTO2FaninBuilder *fanin_builder, uint8_t ring_id ) { if (fanin_builder->contains(prod_state)) { return true; @@ -209,21 +212,7 @@ static bool pto2_append_fanin_or_fail( return true; } - if (fanin_builder->spill_pool == nullptr) { - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Fanin Spill Builder Misconfigured!"); - LOG_ERROR("========================================"); - LOG_ERROR("Missing fanin spill pool while appending dynamic fanin."); - LOG_ERROR(" task_id.raw: %" PRIu64, task_id.raw); - LOG_ERROR(" tensor_arg_index: %d", tensor_arg_index); - LOG_ERROR(" tensor_arg_type: %d", static_cast(ptype)); - LOG_ERROR(" reason: %s", reason); - LOG_ERROR("========================================"); - pto2_orch_mark_fatal(orch, PTO2_ERROR_DEPENDENCY_OVERFLOW); - return false; - } - - PTO2FaninPool &fanin_pool = *fanin_builder->spill_pool; + PTO2FaninPool &fanin_pool = fanin_builder->spill_pool; fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1); int32_t spill_idx = fanin_pool.top; PTO2FaninSpillEntry *entry = fanin_pool.alloc(); @@ -579,10 +568,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke PTO2TaskPayload &payload = *prepared.payload; result.set_task_id(task_id); - PTO2FaninBuilder fanin_builder; - fanin_builder.count = 0; - fanin_builder.spill_start = 0; - fanin_builder.spill_pool = &orch->rings[ring_id].fanin_pool; + PTO2FaninBuilder fanin_builder(orch->rings[ring_id].fanin_pool); CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id.raw); @@ -616,9 +602,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke if (owner.is_valid()) { PTO2TaskSlotState *prod_state = &orch->sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); - if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "creator retention" - )) { + if (!pto2_append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id)) 
{ return result; } } @@ -640,9 +624,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke auto prod_ring = entry.producer_task_id.ring(); auto prod_local = entry.producer_task_id.local(); PTO2TaskSlotState *prod_state = &orch->sm_header->rings[prod_ring].get_slot_state_by_task_id(prod_local); - if (!pto2_append_fanin_or_fail( - orch, task_id, i, ptype, prod_state, &fanin_builder, ring_id, "overlap lookup" - )) { + if (!pto2_append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id)) { return result; } if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) { @@ -667,7 +649,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, task_id.raw); - // === STEP 5: Batch-write to GM (single cache line burst) === + // === STEP 5: Batch-write to GM (single cache line burst) + Record fanin metadata === // Deferred from allocation phase to avoid scattered GM writes that get // evicted by TensorMap lookup/insert cache pressure. __builtin_prefetch(&task, 1, 1); @@ -678,6 +660,24 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke task.packed_buffer_base = prepared.alloc_result.packed_base; task.packed_buffer_end = prepared.alloc_result.packed_end; + // Increment fanout_count on each producer (no lock — only orch writes this field). + // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count. 
+ pto2_for_each_fanin_storage( + fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool, + [](PTO2TaskSlotState *producer) { + producer->fanout_count++; + } + ); + + int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP); + // Store fanin metadata in payload for scheduler to iterate + payload.fanin_actual_count = fanin_builder.count; + payload.fanin_spill_start = fanin_builder.spill_start; + payload.fanin_spill_pool = &fanin_builder.spill_pool; + for (int i = 0; i < inline_count; i++) { + payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; + } + payload.init(args, result, prepared.alloc_result, layout); CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw); @@ -685,36 +685,13 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke g_orch_args_atomic_count += 2; // fanout_lock.store + fanout_count.store #endif - // === STEP 6: Record fanin metadata + push to wiring queue === + // === STEP 6: push to wiring queue === // Deferred wiring: orchestrator only stores dependency metadata and increments // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished) // is handled asynchronously by scheduler thread 0 via the wiring queue. - { - int32_t fanin_count = fanin_builder.count; - int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); - int32_t spill_count = fanin_count - inline_count; - - // Store fanin metadata in payload for scheduler to iterate - payload.fanin_actual_count = fanin_count; - payload.fanin_spill_start = (spill_count > 0) ? fanin_builder.spill_start : 0; - payload.fanin_spill_pool = (spill_count > 0) ? fanin_builder.spill_pool : nullptr; - for (int i = 0; i < inline_count; i++) { - payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; - } - - // Increment fanout_count on each producer (no lock — only orch writes this field). 
- // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count. - pto2_for_each_fanin_slot_state(payload, [](PTO2TaskSlotState *producer) { - producer->fanout_count += 1; - }); - - // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness - while (!sched->wiring.queue.push(&cur_slot_state)) { - SPIN_WAIT_HINT(); - } -#if PTO2_ORCH_PROFILING - g_orch_fanin_atomic_count += 0; // No lock/atomic ops in submit hot path -#endif + // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness + while (!sched->wiring.queue.push(&cur_slot_state)) { + SPIN_WAIT_HINT(); } CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, task_id.raw); @@ -795,7 +772,7 @@ TaskOutputTensors pto2_alloc_tensors(PTO2OrchestratorState *orch, const Arg &arg payload.init(args, outputs, prepared.alloc_result, layout); payload.fanin_actual_count = 0; payload.fanin_spill_start = 0; - payload.fanin_spill_pool = nullptr; + payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool; CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, prepared.task_id.raw); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index 94493bddb..139d42c93 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -32,16 +32,15 @@ void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_ta int32_t scan_end = sm_last_task_alive; for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) { - PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(task_id); - PTO2TaskPayload *payload = slot_state.payload; - if (payload == nullptr || payload->fanin_spill_pool != this) { + PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id); + if 
(payload.fanin_spill_pool != this) { continue; } - int32_t inline_count = std::min(payload->fanin_actual_count, PTO2_FANIN_INLINE_CAP); - int32_t spill_edge_count = payload->fanin_actual_count - inline_count; + int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP); + int32_t spill_edge_count = payload.fanin_actual_count - inline_count; if (spill_edge_count > 0) { - advance_tail(payload->fanin_spill_start + spill_edge_count); + advance_tail(payload.fanin_spill_start + spill_edge_count); } } reclaim_task_cursor = scan_end; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 684551af9..445b6c73a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -440,7 +440,7 @@ using PTO2FaninForEachReturn = std::conditional_t inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( - InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool *spill_pool, Fn &&fn + InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn ) { using FaninCallbackResult = PTO2FaninCallbackResult; static_assert( @@ -459,17 +459,16 @@ inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( return; } - PTO2FaninPool *pool = spill_pool; - int32_t start_idx = spill_start % pool->capacity; - int32_t first_count = std::min(spill_count, pool->capacity - start_idx); - PTO2FaninSpillEntry *first = pool->base + start_idx; + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; for (int32_t i = 0; i < first_count; i++) { fn(first[i].slot_state); } int32_t second_count = spill_count - first_count; for (int32_t i = 0; i < second_count; i++) { - 
fn(pool->base[i].slot_state); + fn(spill_pool.base[i].slot_state); } return; } else { @@ -485,10 +484,9 @@ inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( return true; } - PTO2FaninPool *pool = spill_pool; - int32_t start_idx = spill_start % pool->capacity; - int32_t first_count = std::min(spill_count, pool->capacity - start_idx); - PTO2FaninSpillEntry *first = pool->base + start_idx; + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; for (int32_t i = 0; i < first_count; i++) { if (!fn(first[i].slot_state)) { return false; @@ -497,7 +495,7 @@ inline PTO2FaninForEachReturn pto2_for_each_fanin_storage( int32_t second_count = spill_count - first_count; for (int32_t i = 0; i < second_count; i++) { - if (!fn(pool->base[i].slot_state)) { + if (!fn(spill_pool.base[i].slot_state)) { return false; } } @@ -509,7 +507,7 @@ template inline PTO2FaninForEachReturn pto2_for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) { return pto2_for_each_fanin_storage( payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start, - payload.fanin_spill_pool, static_cast(fn) + *payload.fanin_spill_pool, static_cast(fn) ); } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 953a28cc3..08cd7fabc 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -78,7 +78,6 @@ MAYBE_UNINITIALIZED_BEGIN static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) { PTO2TaskId owner = tensor.owner_task_id; PTO2OrchestratorState &orch = rt->orchestrator; - auto &ring = orch.sm_header->rings[owner.ring()]; // Collect producer slot states from both maps, deduplicated 
by pointer. // +1: one creator slot + up to PTO2_LOOKUP_MAX_RESULTS modifier slots. @@ -88,7 +87,7 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa // Step A: creator retention — read owner directly from tensor metadata if (owner.is_valid()) { - slots[slot_count++] = &ring.get_slot_state_by_task_id(owner.local()); + slots[slot_count++] = &orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); } // Step B: modifier writer lookup (OverlapMap) @@ -96,7 +95,7 @@ static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa orch.tensor_map.lookup(tensor, lookup_result); for (int r = 0; r < lookup_result.count; r++) { PTO2TaskId pid = lookup_result.entries[r].entry->producer_task_id; - PTO2TaskSlotState *s = &ring.get_slot_state_by_task_id(pid.local()); + PTO2TaskSlotState *s = &orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local()); bool already = false; for (int j = 0; j < slot_count; j++) { if (slots[j] == s) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index d89fe698f..180b7b61d 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -92,6 +92,14 @@ struct alignas(64) PTO2SharedMemoryRingHeader { PTO2TaskPayload *task_payloads; PTO2TaskSlotState *slot_states; + PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; } + + PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) { return task_descriptors[local_id & task_window_mask]; } + + PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; } + + PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[local_id & task_window_mask]; } + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } PTO2TaskSlotState 
&get_slot_state_by_task_id(int32_t local_id) { return slot_states[local_id & task_window_mask]; } @@ -131,7 +139,7 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { }; static_assert( - sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0 || sizeof(PTO2SharedMemoryHeader) < 4096, + (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096), "PTO2SharedMemoryHeader should be reasonably sized" );