diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 53dcfa6d1..bbb72e764 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -451,7 +451,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
         pto2_init_complete_.store(true, std::memory_order_release);
     } else {
         while (!pto2_init_complete_.load(std::memory_order_acquire)) {
-            std::this_thread::yield();
+            SPIN_WAIT_HINT();
         }
     }
 
@@ -1082,7 +1082,7 @@ int AicpuExecutor::run(Runtime* runtime) {
 
             // Wait for scheduler's one-time init to complete
             while (!pto2_init_complete_.load(std::memory_order_acquire)) {
-                std::this_thread::yield();
+                SPIN_WAIT_HINT();
             }
 
             // Call orchestration function wrapped in an outer scope
@@ -1209,7 +1209,7 @@ int AicpuExecutor::run(Runtime* runtime) {
             // runtime. Scheduler threads read tensor_data pointers from task descriptors
             // that point into the task descriptor's inline TensorData — freeing early is use-after-free.
             while (finished_count_.load(std::memory_order_acquire) < thread_num_ - 1) {
-                std::this_thread::yield();
+                SPIN_WAIT_HINT();
             }
             DEV_INFO("Thread %d: All scheduler threads finished, destroying runtime", thread_idx);
 
@@ -1224,7 +1224,7 @@ int AicpuExecutor::run(Runtime* runtime) {
         // Device orchestration: wait for Thread 3 to initialize SM header
         if (!runtime->get_orch_built_on_host()) {
             while (!runtime_init_ready_.load(std::memory_order_acquire)) {
-                std::this_thread::yield();
+                SPIN_WAIT_HINT();
             }
         }
         always_assert(rt != nullptr);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp
index e2cfeda4c..a98c91f28 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp
@@ -122,10 +122,9 @@ PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base,
 
     // Zero task descriptors and dep list pool.
     // On multi-round execution the SM buffer may be reused at the same device
-    // address.  PTO2TaskDescriptor contains Tensor members whose move-assignment
-    // calls TensorPool::deref(old_index).  Stale nonzero indices from a previous
-    // round would corrupt the fresh TensorPool.  Zeroing ensures index==0 (no-op
-    // deref) for every slot before new tasks are written.
+    // address.  Stale fanout_head/fanin_head pointers and fanout_lock atomics
+    // from a previous round would corrupt the scheduler's dependency tracking.
+    // Zeroing ensures a clean state for every slot before new tasks are written.
     memset(handle->task_descriptors, 0,
            task_window_size * sizeof(PTO2TaskDescriptor));
     // Skip slot 0 (sentinel: task_id=-1, next=nullptr; deref(0) is a no-op)