From 5131faa02e2f929c9fe42c6e27e3a05742bead9a Mon Sep 17 00:00:00 2001 From: sunkaixuan2018 Date: Fri, 26 Jun 2026 11:22:49 +0800 Subject: [PATCH] Add: configure runtime timeouts from env - Add runtime env overrides for op-execute, stream-sync, and scheduler timeouts - Keep onboard timeout ordering checks so scheduler fires before host timeouts - Let sim scheduler overrides skip onboard-only op/stream ordering limits - Add timeout parsing and platform-ordering unit coverage --- docs/dfx/args-dump.md | 19 +- .../l3_l2_orch_comm_stream.py | 2 +- .../platform/include/common/platform_config.h | 18 +- .../platform/onboard/host/device_runner.cpp | 4 +- .../host/runtime_maker.cpp | 29 +++ .../runtime/pto_runtime2.h | 1 + .../runtime/scheduler/scheduler_dispatch.cpp | 7 +- .../platform/include/common/platform_config.h | 18 +- .../platform/onboard/host/device_runner.cpp | 4 +- .../host/runtime_maker.cpp | 29 +++ .../runtime/pto_runtime2.h | 1 + .../runtime/scheduler/scheduler_dispatch.cpp | 7 +- .../include/host/runtime_timeout_config.h | 227 ++++++++++++++++++ src/common/platform/onboard/aicpu/spin_hint.h | 7 +- .../onboard/host/device_runner_base.cpp | 60 ++++- .../onboard/host/device_runner_base.h | 10 +- src/common/platform/sim/aicpu/spin_hint.h | 3 +- .../l3_l2_orch_comm/test_l3_l2_orch_comm.py | 2 +- tests/ut/cpp/CMakeLists.txt | 2 + .../common/test_runtime_timeout_config.cpp | 192 +++++++++++++++ 20 files changed, 610 insertions(+), 32 deletions(-) create mode 100644 src/common/platform/include/host/runtime_timeout_config.h create mode 100644 tests/ut/cpp/common/test_runtime_timeout_config.cpp diff --git a/docs/dfx/args-dump.md b/docs/dfx/args-dump.md index 0e77e6711..e009eb301 100644 --- a/docs/dfx/args-dump.md +++ b/docs/dfx/args-dump.md @@ -923,6 +923,21 @@ SCHEDULER_TIMEOUT_MS (2 s, onboard) < PLATFORM_OP_EXECUTE_TIMEOUT_US (3 s) < flushes + dumps in-flight and poisons the context and surfaces the error ``` +These defaults can be overridden without rebuilding by 
setting +`PTO2_SCHEDULER_TIMEOUT_MS`, `PTO2_OP_EXECUTE_TIMEOUT_US`, and +`PTO2_STREAM_SYNC_TIMEOUT_MS`. Invalid values, or onboard combinations that +break the ordering above, are ignored with a warning and fall back to the +defaults. The onboard host also requires stream-sync to cover the scheduler +budget plus a 1.5 s scheduler-arming guard for cold init work before the +no-progress timer starts. This guard covers fixed/cold costs such as kernel +registration, orchestration SO dlopen, runtime init, and AICore handshake. +It cannot know the graph-specific maximum orchestration producer wall time, so +callers that raise scheduler/op timeouts must also size +`PTO2_STREAM_SYNC_TIMEOUT_MS` for their worst-case orchestration window. Sim +builds do not have STARS or ACL stream-sync timeouts, but scheduler overrides +are still parsed and applied independently so slow CPU-sim kernels can raise +the no-progress budget without onboard-only ordering limits. + - **Device-side graceful flush (primary).** At 2 s of no progress the AICPU declares the hang, runs the end-of-loop flush, *and* dumps the **partial output** of every task still RUNNING on a core @@ -953,8 +968,8 @@ only recover what was already in the buffer. The chain lives in `spin_hint.h` (`PLATFORM_SCHEDULER_TIMEOUT_MS`, surfaced as `SCHEDULER_TIMEOUT_MS` — 2 s onboard, 5 s in sim where there is no STARS to race) and `platform_config.h` (`PLATFORM_OP_EXECUTE_TIMEOUT_US` / -`PLATFORM_STREAM_SYNC_TIMEOUT_MS`), along with the `#897` distributed-skew -trade-off. +`PLATFORM_STREAM_SYNC_TIMEOUT_MS`). The env overrides use those constants as +their unset fallback and keep the `#897` distributed-skew trade-off. ## 9. 
Related docs diff --git a/examples/a2a3/tensormap_and_ringbuffer/l3_l2_orch_comm_stream/l3_l2_orch_comm_stream.py b/examples/a2a3/tensormap_and_ringbuffer/l3_l2_orch_comm_stream/l3_l2_orch_comm_stream.py index 0feeab4a2..555edb4a3 100644 --- a/examples/a2a3/tensormap_and_ringbuffer/l3_l2_orch_comm_stream/l3_l2_orch_comm_stream.py +++ b/examples/a2a3/tensormap_and_ringbuffer/l3_l2_orch_comm_stream/l3_l2_orch_comm_stream.py @@ -66,7 +66,7 @@ def _build_chip_callable(platform: str) -> ChipCallable: signature=[], func_name="l3_l2_orch_comm_orchestration", binary=orch, - children=[(0, CoreCallable.build(signature=[D.IN, D.OUT], binary=aiv))], + children=[(0, CoreCallable.build(signature=[D.IN, D.OUT], arg_index=[0, 1], binary=aiv))], ) diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h index 45a7c75cb..b08dfb91e 100644 --- a/src/a2a3/platform/include/common/platform_config.h +++ b/src/a2a3/platform/include/common/platform_config.h @@ -60,18 +60,28 @@ constexpr int PLATFORM_MAX_AICPU_THREADS = 4; constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 6; /** - * AICore op execution timeout (microseconds). + * Default AICore op execution timeout (microseconds). * Passed to aclrtSetOpExecuteTimeOutV2 so that STARS actively monitors * AICore task execution and kills ops that exceed this threshold. + * Overridden at runtime by PTO2_OP_EXECUTE_TIMEOUT_US when that env var + * is valid. */ constexpr uint64_t PLATFORM_OP_EXECUTE_TIMEOUT_US = 3000000; // 3s /** - * Host-side stream synchronization timeout (milliseconds). + * Default onboard AICPU scheduler no-progress timeout (milliseconds). + * Shared with host-side timeout ordering validation; sim keeps its own + * wider budget in spin_hint.h because there is no STARS timeout to race. + */ +constexpr int32_t PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS = 2000; + +/** + * Default host-side stream synchronization timeout (milliseconds). 
* Passed to aclrtSynchronizeStreamWithTimeout to detect stream sync hangs. * Must be longer than PLATFORM_OP_EXECUTE_TIMEOUT_US so the host waits for - * STARS to reap the timed-out op and surface the error, rather than giving up - * first. + * STARS to reap the timed-out op and surface the error, rather than giving + * up first. Overridden at runtime by PTO2_STREAM_SYNC_TIMEOUT_MS when that + * env var is valid. */ constexpr int PLATFORM_STREAM_SYNC_TIMEOUT_MS = 4000; // 4s (> op-exec 3s) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 9ec40dbea..c0d0866dd 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -518,7 +518,7 @@ void DeviceRunner::recover_device_or_mark_unusable(int aicore_rc) { // force reset (a soft reset/drain does not), so always mark the runner // unusable here: run() fails fast and finalize() force-resets the card, so // the next Worker.init lands clean regardless of what the drain reported. 
- int sync_rc = aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS); + int sync_rc = aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms); if (sync_rc != ACL_SUCCESS) { LOG_ERROR( "AICore error %d: bounded device drain failed: %d (force reset will follow in finalize)", aicore_rc, sync_rc @@ -623,7 +623,7 @@ int DeviceRunner::force_reset_device() { LOG_ERROR("force_reset_device: could not bind device %d; reset skipped", device_id_); return -1; } - (void)aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS); + (void)aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms); aclError rc = aclrtResetDeviceForce(device_id_); if (rc != ACL_SUCCESS) { LOG_ERROR("force_reset_device: aclrtResetDeviceForce(%d) failed: %d", device_id_, static_cast(rc)); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index b49b73ce8..e16dc5fd2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -48,6 +48,8 @@ #include "callable.h" #include "common/platform_config.h" #include "common/unified_log.h" +#include "host/platform_compile_info.h" +#include "host/runtime_timeout_config.h" #include "utils/device_arena.h" #include "prepare_callable_common.h" @@ -243,6 +245,32 @@ static bool resolve_ring_config( return true; } +static int32_t resolve_scheduler_timeout_ms() { + RuntimeTimeoutParseStatus parse_status; + RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config( + RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status + ); + if (!parse_status.scheduler_env_set) { + return 0; + } + if (!parse_status.scheduler_valid) { + const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV); + LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env); + return 
0; + } + + RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform()); + if (status != RuntimeTimeoutOrderStatus::OK) { + LOG_WARN( + "Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV, + cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status), + (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms + ); + return 0; + } + return cfg.scheduler_timeout_ms; +} + static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) { if (runtime == nullptr || host_header == nullptr) { return 0; @@ -459,6 +487,7 @@ extern "C" int bind_callable_to_runtime_impl( DeviceArena host_arena; // libc malloc backend by default PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities); + layout.scheduler_timeout_ms = resolve_scheduler_timeout_ms(); if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); return -1; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 85680d8c3..ad5537536 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -113,6 +113,7 @@ struct PTO2RuntimeArenaLayout { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{}; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{}; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{}; + int32_t scheduler_timeout_ms{0}; // Total arena byte size post-commit. Used by host to size the prebuilt // image buffer and as the rtMemcpy length. 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index a4260b4d2..37b3d7c49 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -878,6 +878,11 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // "now" so the first budget cycle starts when this thread does, not at // an undefined value. uint64_t last_progress_ts = get_sys_cnt_aicpu(); + uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES; + if (rt_ != nullptr && rt_->prebuilt_layout.scheduler_timeout_ms > 0) { + scheduler_timeout_cycles = + static_cast(rt_->prebuilt_layout.scheduler_timeout_ms) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000); + } while (true) { if (completed_.load(std::memory_order_acquire)) { @@ -1369,7 +1374,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // case) — refresh last_progress_ts and keep spinning. The // STALL diagnostic above still fires periodically so // observability is preserved. - if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) { + if (get_sys_cnt_aicpu() - last_progress_ts > scheduler_timeout_cycles) { bool self_owns = self_owns_running_task(thread_idx); bool global_stuck = !self_owns && total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h index b728eff5e..f859d51b6 100644 --- a/src/a5/platform/include/common/platform_config.h +++ b/src/a5/platform/include/common/platform_config.h @@ -87,18 +87,28 @@ constexpr int PLATFORM_MAX_AICPU_THREADS = 7; constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 14; /** - * AICore op execution timeout (microseconds). 
+ * Default AICore op execution timeout (microseconds). * Passed to aclrtSetOpExecuteTimeOutV2 so that STARS actively monitors * AICore task execution and kills ops that exceed this threshold. + * Overridden at runtime by PTO2_OP_EXECUTE_TIMEOUT_US when that env var + * is valid. */ constexpr uint64_t PLATFORM_OP_EXECUTE_TIMEOUT_US = 3000000; // 3s /** - * Host-side stream synchronization timeout (milliseconds). + * Default onboard AICPU scheduler no-progress timeout (milliseconds). + * Shared with host-side timeout ordering validation; sim keeps its own + * wider budget in spin_hint.h because there is no STARS timeout to race. + */ +constexpr int32_t PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS = 2000; + +/** + * Default host-side stream synchronization timeout (milliseconds). * Passed to aclrtSynchronizeStreamWithTimeout to detect stream sync hangs. * Must be longer than PLATFORM_OP_EXECUTE_TIMEOUT_US so the host waits for - * STARS to reap the timed-out op and surface the error, rather than giving up - * first. + * STARS to reap the timed-out op and surface the error, rather than giving + * up first. Overridden at runtime by PTO2_STREAM_SYNC_TIMEOUT_MS when that + * env var is valid. */ constexpr int PLATFORM_STREAM_SYNC_TIMEOUT_MS = 4000; // 4s (> op-exec 3s) diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index b2b0af478..ef03cf9b3 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -453,7 +453,7 @@ void DeviceRunner::recover_device_or_mark_unusable(int aicore_rc) { // only cleared by a force reset (a soft reset/drain does not), so always mark // the runner unusable here: run() fails fast and finalize() force-resets the // card, so the next Worker.init lands clean regardless of the drain result. 
- int sync_rc = aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS); + int sync_rc = aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms); if (sync_rc != ACL_SUCCESS) { LOG_ERROR( "AICore error %d: bounded device drain failed: %d (force reset will follow in finalize)", aicore_rc, sync_rc @@ -559,7 +559,7 @@ int DeviceRunner::force_reset_device() { LOG_ERROR("force_reset_device: could not bind device %d; reset skipped", device_id_); return -1; } - (void)aclrtSynchronizeDeviceWithTimeout(PLATFORM_STREAM_SYNC_TIMEOUT_MS); + (void)aclrtSynchronizeDeviceWithTimeout(timeout_config_.stream_sync_timeout_ms); aclError rc = aclrtResetDeviceForce(device_id_); if (rc != ACL_SUCCESS) { LOG_ERROR("force_reset_device: aclrtResetDeviceForce(%d) failed: %d", device_id_, static_cast(rc)); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 8fe4c6c6c..e5af5bb3e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -49,6 +49,8 @@ #include "callable.h" #include "common/platform_config.h" #include "common/unified_log.h" +#include "host/platform_compile_info.h" +#include "host/runtime_timeout_config.h" #include "prepare_callable_common.h" static_assert( @@ -243,6 +245,32 @@ static bool resolve_ring_config( return true; } +static int32_t resolve_scheduler_timeout_ms() { + RuntimeTimeoutParseStatus parse_status; + RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config( + RuntimeTimeoutConfig{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, 0}, &parse_status + ); + if (!parse_status.scheduler_env_set) { + return 0; + } + if (!parse_status.scheduler_valid) { + const char *env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV); + LOG_WARN("%s=%s invalid, using platform scheduler timeout", PTO2_SCHEDULER_TIMEOUT_MS_ENV, env); + return 0; + } + + 
RuntimeTimeoutOrderStatus status = validate_runtime_timeout_order_for_platform(cfg, get_platform()); + if (status != RuntimeTimeoutOrderStatus::OK) { + LOG_WARN( + "Ignoring %s=%d: %s (op_execute=%llu us, stream_sync=%d ms)", PTO2_SCHEDULER_TIMEOUT_MS_ENV, + cfg.scheduler_timeout_ms, runtime_timeout_order_status_name(status), + (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms + ); + return 0; + } + return cfg.scheduler_timeout_ms; +} + static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) { if (runtime == nullptr || host_header == nullptr) { return 0; @@ -459,6 +487,7 @@ extern "C" int bind_callable_to_runtime_impl( DeviceArena host_arena; // libc malloc backend by default PTO2RuntimeArenaLayout layout = runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities); + layout.scheduler_timeout_ms = resolve_scheduler_timeout_ms(); if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); return -1; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index db4af47ed..156e0eafa 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -114,6 +114,7 @@ struct PTO2RuntimeArenaLayout { uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{}; uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{}; int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{}; + int32_t scheduler_timeout_ms{0}; // Total arena byte size post-commit. Used by host to size the prebuilt // image buffer and as the rtMemcpy length. 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index e4ac29766..8f994088f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -589,6 +589,11 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // "now" so the first budget cycle starts when this thread does, not at // an undefined value. uint64_t last_progress_ts = get_sys_cnt_aicpu(); + uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES; + if (rt_ != nullptr && rt_->prebuilt_layout.scheduler_timeout_ms > 0) { + scheduler_timeout_cycles = + static_cast(rt_->prebuilt_layout.scheduler_timeout_ms) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000); + } while (true) { if (completed_.load(std::memory_order_acquire)) { @@ -891,7 +896,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ // case) — refresh last_progress_ts and keep spinning. The // STALL diagnostic above still fires periodically so // observability is preserved. - if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) { + if (get_sys_cnt_aicpu() - last_progress_ts > scheduler_timeout_cycles) { bool self_owns = self_owns_running_task(thread_idx); bool global_stuck = !self_owns && total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && diff --git a/src/common/platform/include/host/runtime_timeout_config.h b/src/common/platform/include/host/runtime_timeout_config.h new file mode 100644 index 000000000..a6cef3e33 --- /dev/null +++ b/src/common/platform/include/host/runtime_timeout_config.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SIMPLER_COMMON_PLATFORM_INCLUDE_HOST_RUNTIME_TIMEOUT_CONFIG_H +#define SIMPLER_COMMON_PLATFORM_INCLUDE_HOST_RUNTIME_TIMEOUT_CONFIG_H + +#include +#include +#include +#include +#include +#include +#include + +constexpr const char *PTO2_OP_EXECUTE_TIMEOUT_US_ENV = "PTO2_OP_EXECUTE_TIMEOUT_US"; +constexpr const char *PTO2_STREAM_SYNC_TIMEOUT_MS_ENV = "PTO2_STREAM_SYNC_TIMEOUT_MS"; +constexpr const char *PTO2_SCHEDULER_TIMEOUT_MS_ENV = "PTO2_SCHEDULER_TIMEOUT_MS"; + +// Covers the host stream-sync window before the AICPU scheduler no-progress +// timer is armed: cold kernel registration, orchestration SO dlopen, runtime +// init, and AICore handshake. The host cannot know the later orchestration +// wall-clock maximum from env parsing alone; callers must size stream-sync for +// that graph-specific producer window. 
+constexpr int32_t RUNTIME_TIMEOUT_SCHEDULER_ARMING_GUARD_MS = 1500; + +struct RuntimeTimeoutConfig { + uint64_t op_execute_timeout_us; + int32_t stream_sync_timeout_ms; + int32_t scheduler_timeout_ms; +}; + +struct HostRuntimeTimeoutConfig { + uint64_t op_execute_timeout_us; + int32_t stream_sync_timeout_ms; +}; + +struct RuntimeTimeoutParseStatus { + bool op_execute_env_set{false}; + bool op_execute_valid{true}; + bool stream_sync_env_set{false}; + bool stream_sync_valid{true}; + bool scheduler_env_set{false}; + bool scheduler_valid{true}; +}; + +enum class RuntimeTimeoutOrderStatus { + OK, + SCHEDULER_NOT_BELOW_OP_EXECUTE, + OP_EXECUTE_NOT_BELOW_STREAM_SYNC, + STREAM_SYNC_NOT_COVERING_SCHEDULER_GUARD, +}; + +inline const char *runtime_timeout_order_status_name(RuntimeTimeoutOrderStatus status) { + switch (status) { + case RuntimeTimeoutOrderStatus::OK: + return "OK"; + case RuntimeTimeoutOrderStatus::SCHEDULER_NOT_BELOW_OP_EXECUTE: + return "scheduler timeout must be below op-execute timeout"; + case RuntimeTimeoutOrderStatus::OP_EXECUTE_NOT_BELOW_STREAM_SYNC: + return "op-execute timeout must be below stream-sync timeout"; + case RuntimeTimeoutOrderStatus::STREAM_SYNC_NOT_COVERING_SCHEDULER_GUARD: + return "stream-sync timeout must cover scheduler timeout plus scheduler-arming guard"; + } + return "unknown timeout ordering error"; +} + +inline std::string trim_runtime_timeout_token(const std::string &input) { + size_t begin = 0; + while (begin < input.size() && std::isspace(static_cast(input[begin]))) { + ++begin; + } + size_t end = input.size(); + while (end > begin && std::isspace(static_cast(input[end - 1]))) { + --end; + } + return input.substr(begin, end - begin); +} + +inline bool parse_runtime_timeout_uint(const char *raw, uint64_t min_value, uint64_t max_value, uint64_t *out_value) { + if (raw == nullptr || out_value == nullptr) { + return false; + } + std::string token = trim_runtime_timeout_token(raw); + if (token.empty() || token[0] == '-') { + 
return false; + } + + char *endptr = nullptr; + errno = 0; + unsigned long long parsed = std::strtoull(token.c_str(), &endptr, 10); + if (errno == ERANGE || endptr == token.c_str() || *endptr != '\0') { + return false; + } + + uint64_t value = static_cast(parsed); + if (value < min_value || value > max_value) { + return false; + } + + *out_value = value; + return true; +} + +inline bool apply_runtime_timeout_override( + const char * /*name*/, const char *raw, uint64_t min_value, uint64_t max_value, uint64_t *out_value +) { + uint64_t parsed = 0; + if (!parse_runtime_timeout_uint(raw, min_value, max_value, &parsed)) { + return false; + } + *out_value = parsed; + return true; +} + +inline bool apply_runtime_timeout_override( + const char *name, const char *raw, uint64_t min_value, uint64_t max_value, int32_t *out_value +) { + uint64_t parsed = 0; + if (!apply_runtime_timeout_override(name, raw, min_value, max_value, &parsed)) { + return false; + } + if (parsed > static_cast(std::numeric_limits::max())) { + return false; + } + *out_value = static_cast(parsed); + return true; +} + +inline RuntimeTimeoutConfig +resolve_runtime_timeout_config(const RuntimeTimeoutConfig &defaults, RuntimeTimeoutParseStatus *status = nullptr) { + RuntimeTimeoutConfig cfg = defaults; + if (status != nullptr) { + *status = RuntimeTimeoutParseStatus{}; + } + const char *op_env = std::getenv(PTO2_OP_EXECUTE_TIMEOUT_US_ENV); + if (op_env != nullptr) { + if (status != nullptr) status->op_execute_env_set = true; + bool ok = apply_runtime_timeout_override( + PTO2_OP_EXECUTE_TIMEOUT_US_ENV, op_env, 1, std::numeric_limits::max(), &cfg.op_execute_timeout_us + ); + if (status != nullptr) status->op_execute_valid = ok; + } + const char *sync_env = std::getenv(PTO2_STREAM_SYNC_TIMEOUT_MS_ENV); + if (sync_env != nullptr) { + if (status != nullptr) status->stream_sync_env_set = true; + bool ok = apply_runtime_timeout_override( + PTO2_STREAM_SYNC_TIMEOUT_MS_ENV, sync_env, 1, 
static_cast(std::numeric_limits::max()), + &cfg.stream_sync_timeout_ms + ); + if (status != nullptr) status->stream_sync_valid = ok; + } + const char *sched_env = std::getenv(PTO2_SCHEDULER_TIMEOUT_MS_ENV); + if (sched_env != nullptr) { + if (status != nullptr) status->scheduler_env_set = true; + bool ok = apply_runtime_timeout_override( + PTO2_SCHEDULER_TIMEOUT_MS_ENV, sched_env, 1, static_cast(std::numeric_limits::max()), + &cfg.scheduler_timeout_ms + ); + if (status != nullptr) status->scheduler_valid = ok; + } + return cfg; +} + +inline HostRuntimeTimeoutConfig +resolve_host_runtime_timeout_config(const RuntimeTimeoutConfig &defaults, RuntimeTimeoutParseStatus *status = nullptr) { + HostRuntimeTimeoutConfig cfg{defaults.op_execute_timeout_us, defaults.stream_sync_timeout_ms}; + if (status != nullptr) { + *status = RuntimeTimeoutParseStatus{}; + } + const char *op_env = std::getenv(PTO2_OP_EXECUTE_TIMEOUT_US_ENV); + if (op_env != nullptr) { + if (status != nullptr) status->op_execute_env_set = true; + bool ok = apply_runtime_timeout_override( + PTO2_OP_EXECUTE_TIMEOUT_US_ENV, op_env, 1, std::numeric_limits::max(), &cfg.op_execute_timeout_us + ); + if (status != nullptr) status->op_execute_valid = ok; + } + const char *sync_env = std::getenv(PTO2_STREAM_SYNC_TIMEOUT_MS_ENV); + if (sync_env != nullptr) { + if (status != nullptr) status->stream_sync_env_set = true; + bool ok = apply_runtime_timeout_override( + PTO2_STREAM_SYNC_TIMEOUT_MS_ENV, sync_env, 1, static_cast(std::numeric_limits::max()), + &cfg.stream_sync_timeout_ms + ); + if (status != nullptr) status->stream_sync_valid = ok; + } + return cfg; +} + +inline RuntimeTimeoutOrderStatus validate_runtime_timeout_order(const RuntimeTimeoutConfig &cfg) { + uint64_t scheduler_timeout_us = static_cast(cfg.scheduler_timeout_ms) * 1000; + uint64_t stream_sync_timeout_us = static_cast(cfg.stream_sync_timeout_ms) * 1000; + uint64_t scheduler_guarded_stream_budget_ms = + static_cast(cfg.scheduler_timeout_ms) + 
RUNTIME_TIMEOUT_SCHEDULER_ARMING_GUARD_MS; + if (scheduler_timeout_us >= cfg.op_execute_timeout_us) { + return RuntimeTimeoutOrderStatus::SCHEDULER_NOT_BELOW_OP_EXECUTE; + } + if (cfg.op_execute_timeout_us >= stream_sync_timeout_us) { + return RuntimeTimeoutOrderStatus::OP_EXECUTE_NOT_BELOW_STREAM_SYNC; + } + if (static_cast(cfg.stream_sync_timeout_ms) <= scheduler_guarded_stream_budget_ms) { + return RuntimeTimeoutOrderStatus::STREAM_SYNC_NOT_COVERING_SCHEDULER_GUARD; + } + return RuntimeTimeoutOrderStatus::OK; +} + +inline bool runtime_timeout_platform_is_sim(const char *platform) { + return platform != nullptr && std::strstr(platform, "sim") != nullptr; +} + +inline RuntimeTimeoutOrderStatus +validate_runtime_timeout_order_for_platform(const RuntimeTimeoutConfig &cfg, const char *platform) { + if (runtime_timeout_platform_is_sim(platform)) { + return RuntimeTimeoutOrderStatus::OK; + } + return validate_runtime_timeout_order(cfg); +} + +#endif // SIMPLER_COMMON_PLATFORM_INCLUDE_HOST_RUNTIME_TIMEOUT_CONFIG_H diff --git a/src/common/platform/onboard/aicpu/spin_hint.h b/src/common/platform/onboard/aicpu/spin_hint.h index 8979fa4b3..5174e8044 100644 --- a/src/common/platform/onboard/aicpu/spin_hint.h +++ b/src/common/platform/onboard/aicpu/spin_hint.h @@ -21,6 +21,8 @@ #include +#include "common/platform_config.h" + #define SPIN_WAIT_HINT() ((void)0) // Wall-clock budget (ms) of no task progress before the dispatch loop aborts @@ -33,7 +35,8 @@ // slow distributed startup can false-latch; if that bites, raise this together // with the op-exec / stream-sync timeouts. The sim build keeps the full 5 s (no // STARS to race). The runtime consumes it as SCHEDULER_TIMEOUT_MS (see -// scheduler_types.h). -constexpr int32_t PLATFORM_SCHEDULER_TIMEOUT_MS = 2000; +// scheduler_types.h). Host may override this per run via +// PTO2_SCHEDULER_TIMEOUT_MS after validating the timeout ordering. 
+constexpr int32_t PLATFORM_SCHEDULER_TIMEOUT_MS = PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS; #endif // PLATFORM_A2A3_AICPU_SPIN_HINT_H_ diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp index bbe37e305..56b194448 100644 --- a/src/common/platform/onboard/host/device_runner_base.cpp +++ b/src/common/platform/onboard/host/device_runner_base.cpp @@ -49,6 +49,50 @@ // `print_handshake_results` / `bind_callable_to_runtime` / // `prepare_orch_so`. +namespace { + +HostRuntimeTimeoutConfig resolve_onboard_timeout_config() { + RuntimeTimeoutConfig order_defaults{ + PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS, PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS + }; + HostRuntimeTimeoutConfig defaults{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS}; + RuntimeTimeoutParseStatus parse_status; + HostRuntimeTimeoutConfig cfg = resolve_host_runtime_timeout_config(order_defaults, &parse_status); + + if (parse_status.op_execute_env_set && !parse_status.op_execute_valid) { + const char *op_env = std::getenv(PTO2_OP_EXECUTE_TIMEOUT_US_ENV); + LOG_WARN( + "%s=%s invalid, using default %llu", PTO2_OP_EXECUTE_TIMEOUT_US_ENV, op_env, + (unsigned long long)order_defaults.op_execute_timeout_us + ); + } + + if (parse_status.stream_sync_env_set && !parse_status.stream_sync_valid) { + const char *sync_env = std::getenv(PTO2_STREAM_SYNC_TIMEOUT_MS_ENV); + LOG_WARN( + "%s=%s invalid, using default %d", PTO2_STREAM_SYNC_TIMEOUT_MS_ENV, sync_env, + order_defaults.stream_sync_timeout_ms + ); + } + + bool host_timeout_env_set = parse_status.op_execute_env_set || parse_status.stream_sync_env_set; + RuntimeTimeoutConfig order_cfg{ + cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms, PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS + }; + RuntimeTimeoutOrderStatus order_status = validate_runtime_timeout_order(order_cfg); + if (host_timeout_env_set && order_status != RuntimeTimeoutOrderStatus::OK) { + LOG_WARN( + 
"Ignoring PTO2 timeout env overrides: %s (scheduler=%d ms, op_execute=%llu us, stream_sync=%d ms)", + runtime_timeout_order_status_name(order_status), PLATFORM_ONBOARD_SCHEDULER_TIMEOUT_MS, + (unsigned long long)cfg.op_execute_timeout_us, cfg.stream_sync_timeout_ms + ); + return defaults; + } + return cfg; +} + +} // namespace + DeviceRunnerBase::DeviceRunnerBase() : gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), @@ -238,6 +282,7 @@ int DeviceRunnerBase::attach_current_thread(int device_id) { } if (device_id_ == -1) { + timeout_config_ = resolve_onboard_timeout_config(); configure_aicore_op_timeout(); } @@ -247,15 +292,16 @@ int DeviceRunnerBase::attach_current_thread(int device_id) { void DeviceRunnerBase::configure_aicore_op_timeout() { uint64_t actual_timeout = 0; - int rc = aclrtSetOpExecuteTimeOutV2(PLATFORM_OP_EXECUTE_TIMEOUT_US, &actual_timeout); + int rc = aclrtSetOpExecuteTimeOutV2(timeout_config_.op_execute_timeout_us, &actual_timeout); if (rc != 0) { LOG_ERROR( - "aclrtSetOpExecuteTimeOutV2(%llu us) failed: %d", (unsigned long long)PLATFORM_OP_EXECUTE_TIMEOUT_US, rc + "aclrtSetOpExecuteTimeOutV2(%llu us) failed: %d", (unsigned long long)timeout_config_.op_execute_timeout_us, + rc ); } else { LOG_INFO_V0( "aclrtSetOpExecuteTimeOutV2: requested=%llu us, actual=%llu us", - (unsigned long long)PLATFORM_OP_EXECUTE_TIMEOUT_US, (unsigned long long)actual_timeout + (unsigned long long)timeout_config_.op_execute_timeout_us, (unsigned long long)actual_timeout ); } } @@ -1008,11 +1054,11 @@ int DeviceRunnerBase::prepare_runtime_for_launch(Runtime &runtime, int block_dim int DeviceRunnerBase::sync_run_streams() { LOG_INFO_V0("=== aclrtSynchronizeStreamWithTimeout stream_aicpu_ ==="); - int rc = aclrtSynchronizeStreamWithTimeout(stream_aicpu_, PLATFORM_STREAM_SYNC_TIMEOUT_MS); + int rc = aclrtSynchronizeStreamWithTimeout(stream_aicpu_, 
timeout_config_.stream_sync_timeout_ms); if (rc == ACL_ERROR_RT_STREAM_SYNC_TIMEOUT) { LOG_ERROR( "Stream sync timeout: stream=AICPU timeout_ms=%d device_id=%d block_dim=%d", - PLATFORM_STREAM_SYNC_TIMEOUT_MS, device_id_, block_dim_ + timeout_config_.stream_sync_timeout_ms, device_id_, block_dim_ ); return rc; } @@ -1022,11 +1068,11 @@ int DeviceRunnerBase::sync_run_streams() { } LOG_INFO_V0("=== aclrtSynchronizeStreamWithTimeout stream_aicore_ ==="); - rc = aclrtSynchronizeStreamWithTimeout(stream_aicore_, PLATFORM_STREAM_SYNC_TIMEOUT_MS); + rc = aclrtSynchronizeStreamWithTimeout(stream_aicore_, timeout_config_.stream_sync_timeout_ms); if (rc == ACL_ERROR_RT_STREAM_SYNC_TIMEOUT) { LOG_ERROR( "Stream sync timeout: stream=AICore timeout_ms=%d device_id=%d block_dim=%d", - PLATFORM_STREAM_SYNC_TIMEOUT_MS, device_id_, block_dim_ + timeout_config_.stream_sync_timeout_ms, device_id_, block_dim_ ); return rc; } diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h index c844d2546..d9da79fed 100644 --- a/src/common/platform/onboard/host/device_runner_base.h +++ b/src/common/platform/onboard/host/device_runner_base.h @@ -58,6 +58,7 @@ #include "host/l2_swimlane_collector.h" #include "host/memory_allocator.h" #include "host/pmu_collector.h" +#include "host/runtime_timeout_config.h" #include "host/scope_stats_collector.h" #include "host/tensor_dump_collector.h" #include "prepare_callable_common.h" @@ -532,10 +533,10 @@ class DeviceRunnerBase : public L3L2OrchCommBackend { /** * Wait for both per-Worker streams (AICPU first, then AICore) with - * `PLATFORM_STREAM_SYNC_TIMEOUT_MS`. Distinguishes the timeout - * sentinel `ACL_ERROR_RT_STREAM_SYNC_TIMEOUT` with a stream-id and - * (device, block_dim) context in the log. Returns the first - * non-zero rc encountered. + * the resolved stream-sync timeout. 
+ * Distinguishes the timeout + * sentinel `ACL_ERROR_RT_STREAM_SYNC_TIMEOUT` with a stream-id and (device, + * block_dim) context in the log. Returns the first non-zero rc encountered. */ int sync_run_streams(); @@ -695,6 +696,7 @@ class DeviceRunnerBase : public L3L2OrchCommBackend { int block_dim_{0}; int cores_per_blockdim_{PLATFORM_CORES_PER_BLOCKDIM}; int worker_count_{0}; // Stored for print_handshake_results + HostRuntimeTimeoutConfig timeout_config_{PLATFORM_OP_EXECUTE_TIMEOUT_US, PLATFORM_STREAM_SYNC_TIMEOUT_MS}; // Executor binaries — populated once via `set_executors()` during // simpler_init. `aicore_kernel_binary_` is consumed once by diff --git a/src/common/platform/sim/aicpu/spin_hint.h b/src/common/platform/sim/aicpu/spin_hint.h index 260c3153f..4e01a6260 100644 --- a/src/common/platform/sim/aicpu/spin_hint.h +++ b/src/common/platform/sim/aicpu/spin_hint.h @@ -55,7 +55,8 @@ // STARS reaps). A generous budget also avoids false timeouts when an // oversubscribed CPU-sim kernel (e.g. matmul-heavy) makes real but slow // progress; raise further if a slow kernel still false-times-out. The runtime -// consumes it as SCHEDULER_TIMEOUT_MS (see scheduler_types.h). +// consumes it as SCHEDULER_TIMEOUT_MS (see scheduler_types.h). Host may +// override this per run via PTO2_SCHEDULER_TIMEOUT_MS. 
constexpr int32_t PLATFORM_SCHEDULER_TIMEOUT_MS = 5000; #endif // PLATFORM_A2A3SIM_AICPU_SPIN_HINT_H_ diff --git a/tests/st/a5/tensormap_and_ringbuffer/l3_l2_orch_comm/test_l3_l2_orch_comm.py b/tests/st/a5/tensormap_and_ringbuffer/l3_l2_orch_comm/test_l3_l2_orch_comm.py index 7449c194d..d5813fd15 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/l3_l2_orch_comm/test_l3_l2_orch_comm.py +++ b/tests/st/a5/tensormap_and_ringbuffer/l3_l2_orch_comm/test_l3_l2_orch_comm.py @@ -67,7 +67,7 @@ def _build_chip_callable(platform: str) -> ChipCallable: signature=[], func_name="l3_l2_orch_comm_orchestration", binary=orch, - children=[(0, CoreCallable.build(signature=[D.IN, D.OUT], binary=aiv))], + children=[(0, CoreCallable.build(signature=[D.IN, D.OUT], arg_index=[0, 1], binary=aiv))], ) diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 5fe6dd186..4f8bae650 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -439,6 +439,8 @@ target_link_libraries(test_l3_l2_orch_comm_sim_runner PRIVATE add_test(NAME test_l3_l2_orch_comm_sim_runner COMMAND test_l3_l2_orch_comm_sim_runner) set_tests_properties(test_l3_l2_orch_comm_sim_runner PROPERTIES LABELS "no_hardware") +add_common_utils_test(test_runtime_timeout_config common/test_runtime_timeout_config.cpp) + add_executable(test_scope_stats_collector common/test_scope_stats_collector.cpp ${CMAKE_SOURCE_DIR}/../../../src/common/platform/shared/host/scope_stats_collector.cpp diff --git a/tests/ut/cpp/common/test_runtime_timeout_config.cpp b/tests/ut/cpp/common/test_runtime_timeout_config.cpp new file mode 100644 index 000000000..cd5020a1c --- /dev/null +++ b/tests/ut/cpp/common/test_runtime_timeout_config.cpp @@ -0,0 +1,192 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include <cstdint> +#include <cstdlib> +#include <string> + +#include <gtest/gtest.h> + +#include "host/runtime_timeout_config.h" + +namespace { + +constexpr RuntimeTimeoutConfig kDefaults{3000000, 4000, 2000}; + +void set_env_var(const char *name, const char *value) { +#if defined(_WIN32) + _putenv_s(name, value); +#else + setenv(name, value, 1); +#endif +} + +void unset_env_var(const char *name) { +#if defined(_WIN32) + _putenv_s(name, ""); +#else + unsetenv(name); +#endif +} + +class ScopedUnsetTimeoutEnv { +public: + ScopedUnsetTimeoutEnv() { + save(PTO2_OP_EXECUTE_TIMEOUT_US_ENV, op_); + save(PTO2_STREAM_SYNC_TIMEOUT_MS_ENV, stream_); + save(PTO2_SCHEDULER_TIMEOUT_MS_ENV, scheduler_); + unset_env_var(PTO2_OP_EXECUTE_TIMEOUT_US_ENV); + unset_env_var(PTO2_STREAM_SYNC_TIMEOUT_MS_ENV); + unset_env_var(PTO2_SCHEDULER_TIMEOUT_MS_ENV); + } + + ~ScopedUnsetTimeoutEnv() { + restore(PTO2_OP_EXECUTE_TIMEOUT_US_ENV, op_); + restore(PTO2_STREAM_SYNC_TIMEOUT_MS_ENV, stream_); + restore(PTO2_SCHEDULER_TIMEOUT_MS_ENV, scheduler_); + } + +private: + struct SavedValue { + bool was_set{false}; + std::string value; + }; + + static void save(const char *name, SavedValue &out) { + const char *value = std::getenv(name); + if (value != nullptr) { + out.was_set = true; + out.value = value; + } + } + + static void restore(const char *name, const SavedValue &saved) { + if (saved.was_set) { + set_env_var(name, saved.value.c_str()); + } else { + unset_env_var(name); + } + } + + SavedValue op_; + SavedValue stream_; + SavedValue scheduler_; +}; + +} // 
namespace + +TEST(RuntimeTimeoutConfig, UnsetEnvKeepsDefaults) { + ScopedUnsetTimeoutEnv env; + RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(kDefaults); + + EXPECT_EQ(cfg.op_execute_timeout_us, 3000000u); + EXPECT_EQ(cfg.stream_sync_timeout_ms, 4000); + EXPECT_EQ(cfg.scheduler_timeout_ms, 2000); +} + +TEST(RuntimeTimeoutConfig, ValidEnvOverridesDefaults) { + ScopedUnsetTimeoutEnv env; + set_env_var(PTO2_OP_EXECUTE_TIMEOUT_US_ENV, "5000000"); + set_env_var(PTO2_STREAM_SYNC_TIMEOUT_MS_ENV, "7000"); + set_env_var(PTO2_SCHEDULER_TIMEOUT_MS_ENV, "3000"); + + RuntimeTimeoutParseStatus status; + RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(kDefaults, &status); + + EXPECT_EQ(cfg.op_execute_timeout_us, 5000000u); + EXPECT_EQ(cfg.stream_sync_timeout_ms, 7000); + EXPECT_EQ(cfg.scheduler_timeout_ms, 3000); + EXPECT_TRUE(status.op_execute_env_set); + EXPECT_TRUE(status.stream_sync_env_set); + EXPECT_TRUE(status.scheduler_env_set); + EXPECT_TRUE(status.op_execute_valid); + EXPECT_TRUE(status.stream_sync_valid); + EXPECT_TRUE(status.scheduler_valid); + EXPECT_EQ(validate_runtime_timeout_order(cfg), RuntimeTimeoutOrderStatus::OK); +} + +TEST(RuntimeTimeoutConfig, InvalidEnvKeepsDefaultAndReportsStatus) { + ScopedUnsetTimeoutEnv env; + set_env_var(PTO2_OP_EXECUTE_TIMEOUT_US_ENV, "12ms"); + + RuntimeTimeoutParseStatus status; + RuntimeTimeoutConfig cfg = resolve_runtime_timeout_config(kDefaults, &status); + + EXPECT_EQ(cfg.op_execute_timeout_us, 3000000u); + EXPECT_TRUE(status.op_execute_env_set); + EXPECT_FALSE(status.op_execute_valid); +} + +TEST(RuntimeTimeoutConfig, ReusedParseStatusStartsClean) { + ScopedUnsetTimeoutEnv env; + RuntimeTimeoutParseStatus status; + + set_env_var(PTO2_OP_EXECUTE_TIMEOUT_US_ENV, "5000000"); + resolve_runtime_timeout_config(kDefaults, &status); + EXPECT_TRUE(status.op_execute_env_set); + EXPECT_TRUE(status.op_execute_valid); + + unset_env_var(PTO2_OP_EXECUTE_TIMEOUT_US_ENV); + resolve_runtime_timeout_config(kDefaults, 
&status); + EXPECT_FALSE(status.op_execute_env_set); + EXPECT_TRUE(status.op_execute_valid); +} + +TEST(RuntimeTimeoutConfig, HostConfigIgnoresSchedulerEnv) { + ScopedUnsetTimeoutEnv env; + set_env_var(PTO2_SCHEDULER_TIMEOUT_MS_ENV, "8000"); + + RuntimeTimeoutParseStatus status; + HostRuntimeTimeoutConfig cfg = resolve_host_runtime_timeout_config(kDefaults, &status); + + EXPECT_EQ(cfg.op_execute_timeout_us, 3000000u); + EXPECT_EQ(cfg.stream_sync_timeout_ms, 4000); + EXPECT_FALSE(status.scheduler_env_set); + EXPECT_TRUE(status.scheduler_valid); +} + +TEST(RuntimeTimeoutConfig, InvalidTokenKeepsPriorValue) { + uint64_t value = 42; + + EXPECT_FALSE(apply_runtime_timeout_override("PTO2_OP_EXECUTE_TIMEOUT_US", "12ms", 1, UINT64_MAX, &value)); + EXPECT_EQ(value, 42u); +} + +TEST(RuntimeTimeoutConfig, RejectsBrokenOrdering) { + RuntimeTimeoutConfig cfg = kDefaults; + + cfg.scheduler_timeout_ms = 3500; + EXPECT_EQ(validate_runtime_timeout_order(cfg), RuntimeTimeoutOrderStatus::SCHEDULER_NOT_BELOW_OP_EXECUTE); + + cfg = kDefaults; + cfg.op_execute_timeout_us = 4500000; + EXPECT_EQ(validate_runtime_timeout_order(cfg), RuntimeTimeoutOrderStatus::OP_EXECUTE_NOT_BELOW_STREAM_SYNC); + + cfg = kDefaults; + cfg.scheduler_timeout_ms = 2500; + EXPECT_EQ(validate_runtime_timeout_order(cfg), RuntimeTimeoutOrderStatus::STREAM_SYNC_NOT_COVERING_SCHEDULER_GUARD); +} + +TEST(RuntimeTimeoutConfig, SimPlatformSkipsOnboardOrdering) { + RuntimeTimeoutConfig cfg = kDefaults; + cfg.scheduler_timeout_ms = 8000; + + EXPECT_EQ(validate_runtime_timeout_order_for_platform(cfg, "a2a3sim"), RuntimeTimeoutOrderStatus::OK); + EXPECT_EQ(validate_runtime_timeout_order_for_platform(cfg, "a5sim"), RuntimeTimeoutOrderStatus::OK); + EXPECT_EQ( + validate_runtime_timeout_order_for_platform(cfg, "a2a3"), + RuntimeTimeoutOrderStatus::SCHEDULER_NOT_BELOW_OP_EXECUTE + ); + EXPECT_EQ( + validate_runtime_timeout_order_for_platform(cfg, "a5"), + RuntimeTimeoutOrderStatus::SCHEDULER_NOT_BELOW_OP_EXECUTE + ); +}