Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions docs/dfx/args-dump.md
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,21 @@ SCHEDULER_TIMEOUT_MS (2 s, onboard) < PLATFORM_OP_EXECUTE_TIMEOUT_US (3 s) <
flushes + dumps in-flight and poisons the context and surfaces the error
```

These defaults can be overridden without rebuilding by setting
`PTO2_SCHEDULER_TIMEOUT_MS`, `PTO2_OP_EXECUTE_TIMEOUT_US`, and
`PTO2_STREAM_SYNC_TIMEOUT_MS`. Invalid values, or onboard combinations that
break the ordering above, are ignored with a warning and fall back to the
defaults. The onboard host also requires stream-sync to cover the scheduler
budget plus a 1.5 s scheduler-arming guard for cold init work before the
no-progress timer starts. This guard covers fixed/cold costs such as kernel
registration, orchestration SO dlopen, runtime init, and AICore handshake.
The guard cannot account for graph-specific orchestration time (the longest
wall time the orchestration producer may need), so callers that raise the
scheduler or op-execute timeouts must also size `PTO2_STREAM_SYNC_TIMEOUT_MS`
for their worst-case orchestration window. Sim
builds do not have STARS or ACL stream-sync timeouts, but scheduler overrides
are still parsed and applied independently so slow CPU-sim kernels can raise
the no-progress budget without onboard-only ordering limits.

- **Device-side graceful flush (primary).** At 2 s of no progress
the AICPU declares the hang, runs the end-of-loop flush, *and*
dumps the **partial output** of every task still RUNNING on a core
Expand Down Expand Up @@ -953,8 +968,8 @@ only recover what was already in the buffer. The chain lives in
`spin_hint.h` (`PLATFORM_SCHEDULER_TIMEOUT_MS`, surfaced as
`SCHEDULER_TIMEOUT_MS` — 2 s onboard, 5 s in sim where there is no STARS to
race) and `platform_config.h` (`PLATFORM_OP_EXECUTE_TIMEOUT_US` /
`PLATFORM_STREAM_SYNC_TIMEOUT_MS`), along with the `#897` distributed-skew
trade-off.
`PLATFORM_STREAM_SYNC_TIMEOUT_MS`). The env overrides use those constants as
their unset fallback and keep the `#897` distributed-skew trade-off.

## 9. Related docs

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _build_chip_callable(platform: str) -> ChipCallable:
signature=[],
func_name="l3_l2_orch_comm_orchestration",
binary=orch,
children=[(0, CoreCallable.build(signature=[D.IN, D.OUT], binary=aiv))],
children=[(0, CoreCallable.build(signature=[D.IN, D.OUT], arg_index=[0, 1], binary=aiv))],
)


Expand Down
18 changes: 14 additions & 4 deletions src/a2a3/platform/include/common/platform_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,28 @@ constexpr int PLATFORM_MAX_AICPU_THREADS = 4;
constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 6;

/**
* AICore op execution timeout (microseconds).
* Default AICore op execution timeout (microseconds).
* Passed to aclrtSetOpExecuteTimeOutV2 so that STARS actively monitors
* AICore task execution and kills ops that exceed this threshold.
* Overridden at runtime by PTO2_OP_EXECUTE_TIMEOUT_US when that env var
* is valid.
*/
constexpr uint64_t PLATFORM_OP_EXECUTE_TIMEOUT_US = 3000000; // 3s

/**
* Host-side stream synchronization timeout (milliseconds).
* Default onboard AICPU scheduler no-progress timeout (milliseconds).
* Shared with host-side timeout ordering validation; sim keeps its own
* wider budget in spin_hint.h because there is no STARS timeout to race.
*/
constexpr int32_t PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS = 2000; // 2s (< op-exec 3s)

/**
* Default host-side stream synchronization timeout (milliseconds).
* Passed to aclrtSynchronizeStreamWithTimeout to detect stream sync hangs.
* Must be longer than PLATFORM_OP_EXECUTE_TIMEOUT_US so the host waits for
* STARS to reap the timed-out op and surface the error, rather than giving up
* first.
* STARS to reap the timed-out op and surface the error, rather than giving
* up first. Overridden at runtime by PTO2_STREAM_SYNC_TIMEOUT_MS when that
* env var is valid.
*/
constexpr int PLATFORM_STREAM_SYNC_TIMEOUT_MS = 4000; // 4s (> op-exec 3s)

Expand Down
4 changes: 2 additions & 2 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ void DeviceRunner::recover_device_or_mark_unusable(int aicore_rc) {
// force reset (a soft reset/drain does not), so always mark the runner
// unusable here: run() fails fast and finalize() force-resets the card, so
// the next Worker.init lands clean regardless of what the drain reported.
int sync_rc = aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS);
int sync_rc = aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms);
if (sync_rc != ACL_SUCCESS) {
LOG_ERROR(
"AICore error %d: bounded device drain failed: %d (force reset will follow in finalize)", aicore_rc, sync_rc
Expand Down Expand Up @@ -623,7 +623,7 @@ int DeviceRunner::force_reset_device() {
LOG_ERROR("force_reset_device: could not bind device %d; reset skipped", device_id_);
return -1;
}
(void)aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS);
(void)aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms);
aclError rc = aclrtResetDeviceForce(device_id_);
if (rc != ACL_SUCCESS) {
LOG_ERROR("force_reset_device: aclrtResetDeviceForce(%d) failed: %d", device_id_, static_cast<int>(rc));
Expand Down
29 changes: 29 additions & 0 deletions src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include "callable.h"
#include "common/platform_config.h"
#include "common/unified_log.h"
#include "host/platform_compile_info.h"
#include "host/runtime_timeout_config.h"
#include "utils/device_arena.h"
#include "prepare_callable_common.h"

Expand Down Expand Up @@ -243,6 +245,32 @@ static bool resolve_ring_config(
return true;
}

static int32_t resolve_scheduler_timeout_ms() {
RuntimeTimeoutParseStatus parse_status;
RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(
RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status
);
if (!parse_status.scheduler_env_set) {
return 0;
}
if (!parse_status.scheduler_valid) {
const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV);
LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env);
return 0;
}

RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform());
if (status != RuntimeTimeoutOrderStatus::OK) {
LOG_WARN(
"Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV,
cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status),
(unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms
);
return 0;
}
return cfg.scheduler_timeout_ms;
}

static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
if (runtime == nullptr || host_header == nullptr) {
return 0;
Expand Down Expand Up @@ -459,6 +487,7 @@ extern "C" int bind_callable_to_runtime_impl(
DeviceArena host_arena; // libc malloc backend by default
PTO2RuntimeArenaLayout layout =
runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
layout.scheduler_timeout_ms = resolve_scheduler_timeout_ms();
if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
return -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ struct PTO2RuntimeArenaLayout {
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
int32_t scheduler_timeout_ms{0};

// Total arena byte size post-commit. Used by host to size the prebuilt
// image buffer and as the rtMemcpy length.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,11 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// "now" so the first budget cycle starts when this thread does, not at
// an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();
uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
if (rt_ != nullptr && rt_->prebuilt_layout.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles =
static_cast<uint64_t>(rt_->prebuilt_layout.scheduler_timeout_ms) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
}

while (true) {
if (completed_.load(std::memory_order_acquire)) {
Expand Down Expand Up @@ -1369,7 +1374,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// case) — refresh last_progress_ts and keep spinning. The
// STALL diagnostic above still fires periodically so
// observability is preserved.
if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
if (get_sys_cnt_aicpu() - last_progress_ts > scheduler_timeout_cycles) {
bool self_owns = self_owns_running_task(thread_idx);
bool global_stuck = !self_owns && total_tasks_ > 0 &&
completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
Expand Down
18 changes: 14 additions & 4 deletions src/a5/platform/include/common/platform_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,18 +87,28 @@ constexpr int PLATFORM_MAX_AICPU_THREADS = 7;
constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 14;

/**
* AICore op execution timeout (microseconds).
* Default AICore op execution timeout (microseconds).
* Passed to aclrtSetOpExecuteTimeOutV2 so that STARS actively monitors
* AICore task execution and kills ops that exceed this threshold.
* Overridden at runtime by PTO2_OP_EXECUTE_TIMEOUT_US when that env var
* is valid.
*/
constexpr uint64_t PLATFORM_OP_EXECUTE_TIMEOUT_US = 3000000; // 3s

/**
* Host-side stream synchronization timeout (milliseconds).
* Default onboard AICPU scheduler no-progress timeout (milliseconds).
* Shared with host-side timeout ordering validation; sim keeps its own
* wider budget in spin_hint.h because there is no STARS timeout to race.
*/
constexpr int32_t PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS = 2000; // 2s (< op-exec 3s)

/**
* Default host-side stream synchronization timeout (milliseconds).
* Passed to aclrtSynchronizeStreamWithTimeout to detect stream sync hangs.
* Must be longer than PLATFORM_OP_EXECUTE_TIMEOUT_US so the host waits for
* STARS to reap the timed-out op and surface the error, rather than giving up
* first.
* STARS to reap the timed-out op and surface the error, rather than giving
* up first. Overridden at runtime by PTO2_STREAM_SYNC_TIMEOUT_MS when that
* env var is valid.
*/
constexpr int PLATFORM_STREAM_SYNC_TIMEOUT_MS = 4000; // 4s (> op-exec 3s)

Expand Down
4 changes: 2 additions & 2 deletions src/a5/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ void DeviceRunner::recover_device_or_mark_unusable(int aicore_rc) {
// only cleared by a force reset (a soft reset/drain does not), so always mark
// the runner unusable here: run() fails fast and finalize() force-resets the
// card, so the next Worker.init lands clean regardless of the drain result.
int sync_rc = aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS);
int sync_rc = aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms);
if (sync_rc != ACL_SUCCESS) {
LOG_ERROR(
"AICore error %d: bounded device drain failed: %d (force reset will follow in finalize)", aicore_rc, sync_rc
Expand Down Expand Up @@ -559,7 +559,7 @@ int DeviceRunner::force_reset_device() {
LOG_ERROR("force_reset_device: could not bind device %d; reset skipped", device_id_);
return -1;
}
(void)aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS);
(void)aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms);
aclError rc = aclrtResetDeviceForce(device_id_);
if (rc != ACL_SUCCESS) {
LOG_ERROR("force_reset_device: aclrtResetDeviceForce(%d) failed: %d", device_id_, static_cast<int>(rc));
Expand Down
29 changes: 29 additions & 0 deletions src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
#include "callable.h"
#include "common/platform_config.h"
#include "common/unified_log.h"
#include "host/platform_compile_info.h"
#include "host/runtime_timeout_config.h"
#include "prepare_callable_common.h"

static_assert(
Expand Down Expand Up @@ -243,6 +245,32 @@ static bool resolve_ring_config(
return true;
}

static int32_t resolve_scheduler_timeout_ms() {
RuntimeTimeoutParseStatus parse_status;
RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(
RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status
);
if (!parse_status.scheduler_env_set) {
return 0;
}
if (!parse_status.scheduler_valid) {
const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV);
LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env);
return 0;
}

RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform());
if (status != RuntimeTimeoutOrderStatus::OK) {
LOG_WARN(
"Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV,
cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status),
(unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms
);
return 0;
}
return cfg.scheduler_timeout_ms;
}

static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
if (runtime == nullptr || host_header == nullptr) {
return 0;
Expand Down Expand Up @@ -459,6 +487,7 @@ extern "C" int bind_callable_to_runtime_impl(
DeviceArena host_arena; // libc malloc backend by default
PTO2RuntimeArenaLayout layout =
runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
layout.scheduler_timeout_ms = resolve_scheduler_timeout_ms();
if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
return -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ struct PTO2RuntimeArenaLayout {
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
int32_t scheduler_timeout_ms{0};

// Total arena byte size post-commit. Used by host to size the prebuilt
// image buffer and as the rtMemcpy length.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,11 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// "now" so the first budget cycle starts when this thread does, not at
// an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();
uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
if (rt_ != nullptr && rt_->prebuilt_layout.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles =
static_cast<uint64_t>(rt_->prebuilt_layout.scheduler_timeout_ms) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
}

while (true) {
if (completed_.load(std::memory_order_acquire)) {
Expand Down Expand Up @@ -891,7 +896,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// case) — refresh last_progress_ts and keep spinning. The
// STALL diagnostic above still fires periodically so
// observability is preserved.
if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
if (get_sys_cnt_aicpu() - last_progress_ts > scheduler_timeout_cycles) {
bool self_owns = self_owns_running_task(thread_idx);
bool global_stuck = !self_owns && total_tasks_ > 0 &&
completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
Expand Down
Loading
Loading