diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 53dcfa6d1..bbb72e764 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -451,7 +451,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, pto2_init_complete_.store(true, std::memory_order_release); } else { while (!pto2_init_complete_.load(std::memory_order_acquire)) { - std::this_thread::yield(); + SPIN_WAIT_HINT(); } } @@ -1082,7 +1082,7 @@ int AicpuExecutor::run(Runtime* runtime) { // Wait for scheduler's one-time init to complete while (!pto2_init_complete_.load(std::memory_order_acquire)) { - std::this_thread::yield(); + SPIN_WAIT_HINT(); } // Call orchestration function wrapped in an outer scope @@ -1209,7 +1209,7 @@ int AicpuExecutor::run(Runtime* runtime) { // runtime. Scheduler threads read tensor_data pointers from task descriptors // that point into the task descriptor's inline TensorData — freeing early is use-after-free. while (finished_count_.load(std::memory_order_acquire) < thread_num_ - 1) { - std::this_thread::yield(); + SPIN_WAIT_HINT(); } DEV_INFO("Thread %d: All scheduler threads finished, destroying runtime", thread_idx); @@ -1224,7 +1224,7 @@ int AicpuExecutor::run(Runtime* runtime) { // Device orchestration: wait for Thread 3 to initialize SM header if (!runtime->get_orch_built_on_host()) { while (!runtime_init_ready_.load(std::memory_order_acquire)) { - std::this_thread::yield(); + SPIN_WAIT_HINT(); } } always_assert(rt != nullptr); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index e2cfeda4c..a98c91f28 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -122,10 +122,9 @@ PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base, // Zero task descriptors and dep list pool. // On multi-round execution the SM buffer may be reused at the same device - // address. PTO2TaskDescriptor contains Tensor members whose move-assignment - // calls TensorPool::deref(old_index). Stale nonzero indices from a previous - // round would corrupt the fresh TensorPool. Zeroing ensures index==0 (no-op - // deref) for every slot before new tasks are written. + // address. Stale fanout_head/fanin_head pointers and fanout_lock atomics + // from a previous round would corrupt the scheduler's dependency tracking. + // Zeroing ensures clean state for every slot before new tasks are written. memset(handle->task_descriptors, 0, task_window_size * sizeof(PTO2TaskDescriptor)); // Skip slot 0 (sentinel: task_id=-1, next=nullptr; deref(0) is a no-op)