Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
pto2_init_complete_.store(true, std::memory_order_release);
} else {
while (!pto2_init_complete_.load(std::memory_order_acquire)) {
- std::this_thread::yield();
+ SPIN_WAIT_HINT();
Comment thread
ChaoWao marked this conversation as resolved.
}
}

Expand Down Expand Up @@ -1082,7 +1082,7 @@ int AicpuExecutor::run(Runtime* runtime) {

// Wait for scheduler's one-time init to complete
while (!pto2_init_complete_.load(std::memory_order_acquire)) {
- std::this_thread::yield();
+ SPIN_WAIT_HINT();
Comment thread
ChaoWao marked this conversation as resolved.
}

// Call orchestration function wrapped in an outer scope
Expand Down Expand Up @@ -1209,7 +1209,7 @@ int AicpuExecutor::run(Runtime* runtime) {
// runtime. Scheduler threads read tensor_data pointers from task descriptors
// that point into the task descriptor's inline TensorData — freeing early is use-after-free.
while (finished_count_.load(std::memory_order_acquire) < thread_num_ - 1) {
- std::this_thread::yield();
+ SPIN_WAIT_HINT();
Comment thread
ChaoWao marked this conversation as resolved.
}
DEV_INFO("Thread %d: All scheduler threads finished, destroying runtime", thread_idx);

Expand All @@ -1224,7 +1224,7 @@ int AicpuExecutor::run(Runtime* runtime) {
// Device orchestration: wait for Thread 3 to initialize SM header
if (!runtime->get_orch_built_on_host()) {
while (!runtime_init_ready_.load(std::memory_order_acquire)) {
- std::this_thread::yield();
+ SPIN_WAIT_HINT();
Comment thread
ChaoWao marked this conversation as resolved.
}
}
always_assert(rt != nullptr);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,9 @@ PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base,

// Zero task descriptors and dep list pool.
// On multi-round execution the SM buffer may be reused at the same device
- // address. PTO2TaskDescriptor contains Tensor members whose move-assignment
- // calls TensorPool::deref(old_index). Stale nonzero indices from a previous
- // round would corrupt the fresh TensorPool. Zeroing ensures index==0 (no-op
- // deref) for every slot before new tasks are written.
+ // address. Stale fanout_head/fanin_head pointers and fanout_lock atomics
+ // from a previous round would corrupt the scheduler's dependency tracking.
+ // Zeroing ensures clean state for every slot before new tasks are written.
memset(handle->task_descriptors, 0,
task_window_size * sizeof(PTO2TaskDescriptor));
// Skip slot 0 (sentinel: task_id=-1, next=nullptr; deref(0) is a no-op)
Expand Down
Loading