From 3623858726878a81623853e9797468a571814cdb Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 27 May 2026 20:29:27 +0800 Subject: [PATCH] Refactor: extract DeviceArgs + KernelArgsHelper to src/common/platform/onboard/host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires up a new shared module `src/common/platform/onboard/host/` for code that's line-identical between a2a3 and a5 onboard `libhost_runtime.so`, and moves the first two structs into it: - `DeviceArgs` (per-task AICPU device-args, layout identical on both archs). - `KernelArgsHelper` (6 common methods: init/finalize × {device_args, runtime_args, device_kernel_args}). a2a3-only extensions (which can't live on the common struct because a5's `KernelArgs` has no matching field): - `init_ffts_base_addr` becomes a free function `kernel_args_init_ffts_base_addr(KernelArgsHelper &)` in a2a3's `device_runner.h`. One callsite updated. Build / link: - Both arch `CMakeLists.txt` add `src/common/platform/onboard/host` to the include path and pull in `device_runner_helpers.cpp`. - The common header `#include "common/kernel_args.h"` resolves to the arch-specific `KernelArgs` layout via each arch's existing include path — no change in arch ABI. Net delta: -119 LoC (-386 / +267 across 8 files). ABI / Python surface unchanged. Sim onboard untouched. Local validation (a2a3 onboard, device 1): - vector_example: PASS - aicore_op_timeout: PASS See `.docs/ONBOARD_HOST_COMMON_REFACTOR.md` for the full migration plan (this is PR 1 + PR 2 of the planned series — scaffolding + first extraction collapsed into one PR after PR #877 removed `AicpuSoInfo`). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/a2a3/platform/onboard/host/CMakeLists.txt | 7 + .../platform/onboard/host/device_runner.cpp | 103 +-------------- .../platform/onboard/host/device_runner.h | 105 +-------------- src/a5/platform/onboard/host/CMakeLists.txt | 7 + .../platform/onboard/host/device_runner.cpp | 98 -------------- src/a5/platform/onboard/host/device_runner.h | 94 +------------- .../onboard/host/device_runner_helpers.cpp | 117 +++++++++++++++++ .../onboard/host/device_runner_helpers.h | 122 ++++++++++++++++++ 8 files changed, 267 insertions(+), 386 deletions(-) create mode 100644 src/common/platform/onboard/host/device_runner_helpers.cpp create mode 100644 src/common/platform/onboard/host/device_runner_helpers.h diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index e607e5fd7..4750fe6d9 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -24,6 +24,7 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -65,6 +66,12 @@ list(APPEND HOST_RUNTIME_SOURCES list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" ) +# Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard). +# Currently DeviceArgs + KernelArgsHelper; further code moves here in follow-up +# PRs (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md). 
+list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 3079c9aad..9fc408a6a 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -92,75 +92,10 @@ HalHostUnregisterFn get_halHostUnregister() { } // namespace // ============================================================================= -// KernelArgsHelper Implementation +// a2a3-only KernelArgsHelper extension // ============================================================================= -int KernelArgsHelper::init_device_args(const DeviceArgs &host_device_args, MemoryAllocator &allocator) { - allocator_ = &allocator; - - // Allocate device memory for device_args - if (args.device_args == nullptr) { - uint64_t device_args_size = sizeof(DeviceArgs); - void *device_args_dev = allocator_->alloc(device_args_size); - if (device_args_dev == nullptr) { - LOG_ERROR("Alloc for device_args failed"); - return -1; - } - args.device_args = reinterpret_cast(device_args_dev); - } - // Copy host_device_args to device memory via device_args - int rc = - rtMemcpy(args.device_args, sizeof(DeviceArgs), &host_device_args, sizeof(DeviceArgs), RT_MEMCPY_HOST_TO_DEVICE); - if (rc != 0) { - LOG_ERROR("rtMemcpy failed: %d", rc); - allocator_->free(args.device_args); - args.device_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::finalize_device_args() { - if (args.device_args != nullptr && allocator_ != nullptr) { - int rc = allocator_->free(args.device_args); - args.device_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::init_runtime_args(const Runtime &host_runtime, MemoryAllocator 
&allocator) { - allocator_ = &allocator; - - if (args.runtime_args == nullptr) { - uint64_t runtime_size = sizeof(Runtime); - void *runtime_dev = allocator_->alloc(runtime_size); - if (runtime_dev == nullptr) { - LOG_ERROR("Alloc for runtime_args failed"); - return -1; - } - args.runtime_args = reinterpret_cast(runtime_dev); - } - int rc = rtMemcpy(args.runtime_args, sizeof(Runtime), &host_runtime, sizeof(Runtime), RT_MEMCPY_HOST_TO_DEVICE); - if (rc != 0) { - LOG_ERROR("rtMemcpy for runtime failed: %d", rc); - allocator_->free(args.runtime_args); - args.runtime_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::finalize_runtime_args() { - if (args.runtime_args != nullptr && allocator_ != nullptr) { - int rc = allocator_->free(args.runtime_args); - args.runtime_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::init_ffts_base_addr() { +int kernel_args_init_ffts_base_addr(KernelArgsHelper &helper) { uint64_t ffts_base_addr{0}; uint32_t ffts_len{0}; int rc = rtGetC2cCtrlAddr(&ffts_base_addr, &ffts_len); @@ -168,37 +103,7 @@ int KernelArgsHelper::init_ffts_base_addr() { LOG_ERROR("rtGetC2cCtrlAddr failed: %d", rc); return rc; } - args.ffts_base_addr = ffts_base_addr; - return 0; -} - -int KernelArgsHelper::init_device_kernel_args(MemoryAllocator &allocator) { - allocator_ = &allocator; - - if (device_k_args_ == nullptr) { - void *dev_ptr = allocator_->alloc(sizeof(KernelArgs)); - if (dev_ptr == nullptr) { - LOG_ERROR("Alloc for device KernelArgs failed"); - return -1; - } - device_k_args_ = reinterpret_cast(dev_ptr); - } - int rc = rtMemcpy(device_k_args_, sizeof(KernelArgs), &args, sizeof(KernelArgs), RT_MEMCPY_HOST_TO_DEVICE); - if (rc != 0) { - LOG_ERROR("rtMemcpy for KernelArgs failed: %d", rc); - allocator_->free(device_k_args_); - device_k_args_ = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::finalize_device_kernel_args() { - if (device_k_args_ != nullptr && allocator_ != nullptr) { - int rc = 
allocator_->free(device_k_args_); - device_k_args_ = nullptr; - return rc; - } + helper.args.ffts_base_addr = ffts_base_addr; return 0; } @@ -763,7 +668,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { kernel_args_.args.log_level = static_cast(HostLogger::get_instance().level()); kernel_args_.args.log_info_v = static_cast(HostLogger::get_instance().info_v()); - rc = kernel_args_.init_ffts_base_addr(); + rc = kernel_args_init_ffts_base_addr(kernel_args_); if (rc != 0) { LOG_ERROR("init_ffts_base_addr failed: %d", rc); return rc; diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index c58fb2b99..bac1ff9a2 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -44,6 +44,7 @@ #include "common/platform_config.h" #include "common/unified_log.h" #include "device_arena.h" +#include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/l2_perf_collector.h" @@ -54,106 +55,14 @@ #include "runtime.h" /** - * DeviceArgs structure for AICPU device arguments. + * a2a3-only `KernelArgsHelper` extension: retrieve the FFTS base address via + * `rtGetC2cCtrlAddr` and store it in the wrapped `KernelArgs`. a5's + * `KernelArgs` has no `ffts_base_addr` field, so this helper lives in the + * arch-specific header rather than on the common `KernelArgsHelper` struct. * - * Layout offsets are still nominally fixed by libaicpu_extend_kernels.so for - * aicpu_so_bin / aicpu_so_len (at offsets 96 / 104), but per-task AICPU - * launches go through rtsLaunchCpuKernel against the cached rtFuncHandle on - * LoadAicpuOp — none of our code reads these fields. 
The fields are kept - * (zero-initialized, never assigned) so the H2D struct layout matches the - * historical contract on both archs; an earlier "the H2D allocation pointed - * to by aicpu_so_bin is load-bearing on a5 onboard" finding no longer - * reproduces against current HEAD (post #864/#870), so the runner-side - * AicpuSoInfo allocation was removed. + * @return 0 on success, error code on failure. */ -struct DeviceArgs { - uint64_t unused[12] = {0}; - uint64_t aicpu_so_bin{0}; - uint64_t aicpu_so_len{0}; -}; - -/** - * Helper class for managing KernelArgs with device memory - * - * This class wraps KernelArgs and provides host-side initialization methods - * for allocating device memory and copying data to the device. It separates - * the concerns of device memory management (host-only) from the structure - * layout (shared with kernels). - * - * The helper provides implicit conversion to KernelArgs* for seamless use - * with runtime APIs. - */ -struct KernelArgsHelper { - KernelArgs args; - MemoryAllocator *allocator_{nullptr}; - KernelArgs *device_k_args_{nullptr}; // Device copy of KernelArgs for AICore - - /** - * Initialize device arguments by allocating device memory and copying data - * - * @param host_device_args Host-side device arguments to copy - * @param allocator Memory allocator to use - * @return 0 on success, error code on failure - */ - int init_device_args(const DeviceArgs &host_device_args, MemoryAllocator &allocator); - - /** - * Free device memory allocated for device arguments - * - * @return 0 on success, error code on failure - */ - int finalize_device_args(); - - /** - * Initialize runtime arguments by allocating device memory and copying data - * - * @param host_runtime Host-side runtime to copy to device - * @param allocator Memory allocator to use - * @return 0 on success, error code on failure - */ - int init_runtime_args(const Runtime &host_runtime, MemoryAllocator &allocator); - - /** - * Free device memory allocated for 
runtime arguments - * - * @return 0 on success, error code on failure - */ - int finalize_runtime_args(); - - /** - * Retrieve FFTS base address via rtGetC2cCtrlAddr and store in KernelArgs - * - * @return 0 on success, error code on failure - */ - int init_ffts_base_addr(); - - /** - * Copy KernelArgs to device memory for AICore kernel parameter passing - * - * Must be called after init_runtime_args and init_ffts_base_addr. - * - * @param allocator Memory allocator to use - * @return 0 on success, error code on failure - */ - int init_device_kernel_args(MemoryAllocator &allocator); - - /** - * Free device memory allocated for KernelArgs copy - * - * @return 0 on success, error code on failure - */ - int finalize_device_kernel_args(); - - /** - * Implicit conversion operators for seamless use with runtime APIs - * - * These operators allow KernelArgsHelper to be used wherever KernelArgs* - * is expected, enabling transparent device memory management while - * maintaining API compatibility. 
- */ - operator KernelArgs *() { return &args; } - KernelArgs *operator&() { return &args; } -}; +int kernel_args_init_ffts_base_addr(KernelArgsHelper &helper); /** * Device runner for kernel execution diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index c1a006cef..b280f5fbb 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -25,6 +25,7 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -50,6 +51,12 @@ list(APPEND HOST_RUNTIME_SOURCES list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" ) +# Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard). +# Currently DeviceArgs + KernelArgsHelper; further code moves here in follow-up +# PRs (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md). 
+list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 2d7b1e0fa..bdfb77f27 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -38,104 +38,6 @@ #include "host/host_regs.h" // Register address retrieval #include "host/raii_scope_guard.h" -// ============================================================================= -// KernelArgsHelper Implementation -// ============================================================================= - -int KernelArgsHelper::init_device_args(const DeviceArgs &host_device_args, MemoryAllocator &allocator) { - allocator_ = &allocator; - - // Allocate device memory for device_args - if (args.device_args == nullptr) { - uint64_t device_args_size = sizeof(DeviceArgs); - void *device_args_dev = allocator_->alloc(device_args_size); - if (device_args_dev == nullptr) { - LOG_ERROR("Alloc for device_args failed"); - return -1; - } - args.device_args = reinterpret_cast(device_args_dev); - } - // Copy host_device_args to device memory via device_args - int rc = - rtMemcpy(args.device_args, sizeof(DeviceArgs), &host_device_args, sizeof(DeviceArgs), RT_MEMCPY_HOST_TO_DEVICE); - if (rc != 0) { - LOG_ERROR("rtMemcpy failed: %d", rc); - allocator_->free(args.device_args); - args.device_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::finalize_device_args() { - if (args.device_args != nullptr && allocator_ != nullptr) { - int rc = allocator_->free(args.device_args); - args.device_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::init_runtime_args(const Runtime &host_runtime, MemoryAllocator &allocator) { - 
allocator_ = &allocator; - - if (args.runtime_args == nullptr) { - uint64_t runtime_size = sizeof(Runtime); - void *runtime_dev = allocator_->alloc(runtime_size); - if (runtime_dev == nullptr) { - LOG_ERROR("Alloc for runtime_args failed"); - return -1; - } - args.runtime_args = reinterpret_cast(runtime_dev); - } - int rc = rtMemcpy(args.runtime_args, sizeof(Runtime), &host_runtime, sizeof(Runtime), RT_MEMCPY_HOST_TO_DEVICE); - if (rc != 0) { - LOG_ERROR("rtMemcpy for runtime failed: %d", rc); - allocator_->free(args.runtime_args); - args.runtime_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::finalize_runtime_args() { - if (args.runtime_args != nullptr && allocator_ != nullptr) { - int rc = allocator_->free(args.runtime_args); - args.runtime_args = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::init_device_kernel_args(MemoryAllocator &allocator) { - allocator_ = &allocator; - if (device_k_args_ == nullptr) { - void *dev_ptr = allocator_->alloc(sizeof(KernelArgs)); - if (dev_ptr == nullptr) { - LOG_ERROR("Alloc for device KernelArgs failed"); - return -1; - } - device_k_args_ = reinterpret_cast(dev_ptr); - } - int rc = rtMemcpy(device_k_args_, sizeof(KernelArgs), &args, sizeof(KernelArgs), RT_MEMCPY_HOST_TO_DEVICE); - if (rc != 0) { - LOG_ERROR("rtMemcpy for KernelArgs failed: %d", rc); - allocator_->free(device_k_args_); - device_k_args_ = nullptr; - return rc; - } - return 0; -} - -int KernelArgsHelper::finalize_device_kernel_args() { - if (device_k_args_ != nullptr && allocator_ != nullptr) { - int rc = allocator_->free(device_k_args_); - device_k_args_ = nullptr; - return rc; - } - return 0; -} - // ============================================================================= // DeviceRunner Implementation // ============================================================================= diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 142536426..86a0175f5 
100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -39,6 +39,7 @@ #include "callable.h" #include "prepare_callable_common.h" #include "device_arena.h" +#include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper #include "common/kernel_args.h" #include "common/memory_barrier.h" #include "common/l2_perf_profiling.h" @@ -52,97 +53,8 @@ #include "load_aicpu_op.h" #include "runtime.h" -/** - * DeviceArgs structure for AICPU device arguments. - * - * Layout offsets are still nominally fixed by libaicpu_extend_kernels.so for - * aicpu_so_bin / aicpu_so_len (at offsets 96 / 104), but per-task AICPU - * launches go through rtsLaunchCpuKernel against the cached rtFuncHandle on - * LoadAicpuOp — none of our code reads these fields. The fields are kept - * (zero-initialized, never assigned) so the H2D struct layout matches the - * historical contract on both archs; an earlier "the H2D allocation pointed - * to by aicpu_so_bin is load-bearing on a5 onboard" finding no longer - * reproduces against current HEAD (post #864/#870), so the runner-side - * AicpuSoInfo allocation was removed. - */ -struct DeviceArgs { - uint64_t unused[12] = {0}; - uint64_t aicpu_so_bin{0}; - uint64_t aicpu_so_len{0}; -}; - -/** - * Helper class for managing KernelArgs with device memory - * - * This class wraps KernelArgs and provides host-side initialization methods - * for allocating device memory and copying data to the device. It separates - * the concerns of device memory management (host-only) from the structure - * layout (shared with kernels). - * - * The helper provides implicit conversion to KernelArgs* for seamless use - * with runtime APIs. 
- */ -struct KernelArgsHelper { - KernelArgs args; - MemoryAllocator *allocator_{nullptr}; - KernelArgs *device_k_args_{nullptr}; // Device pointer (populated by init_device_kernel_args) - - /** - * Initialize device arguments by allocating device memory and copying data - * - * @param host_device_args Host-side device arguments to copy - * @param allocator Memory allocator to use - * @return 0 on success, error code on failure - */ - int init_device_args(const DeviceArgs &host_device_args, MemoryAllocator &allocator); - - /** - * Free device memory allocated for device arguments - * - * @return 0 on success, error code on failure - */ - int finalize_device_args(); - - /** - * Initialize runtime arguments by allocating device memory and copying data - * - * @param host_runtime Host-side runtime to copy to device - * @param allocator Memory allocator to use - * @return 0 on success, error code on failure - */ - int init_runtime_args(const Runtime &host_runtime, MemoryAllocator &allocator); - - /** - * Free device memory allocated for runtime arguments - * - * @return 0 on success, error code on failure - */ - int finalize_runtime_args(); - - /** - * Allocate device memory for the host-resident KernelArgs and copy the - * struct over. AICore's KERNEL_ENTRY expects a KernelArgs* (not a - * Runtime*) so it can read the profiling enablement bits + ring address - * tables and forward them into AICore platform state. Call this after - * every kernel_args.args.* field is populated for the run. - */ - int init_device_kernel_args(MemoryAllocator &allocator); - - /** - * Free device memory allocated for the device-resident KernelArgs copy. - */ - int finalize_device_kernel_args(); - - /** - * Implicit conversion operators for seamless use with runtime APIs - * - * These operators allow KernelArgsHelper to be used wherever KernelArgs* - * is expected, enabling transparent device memory management while - * maintaining API compatibility. 
- */ - operator KernelArgs *() { return &args; } - KernelArgs *operator&() { return &args; } -}; +// DeviceArgs + KernelArgsHelper are defined in +// src/common/platform/onboard/host/device_runner_helpers.h (included above). /** * Device runner for kernel execution diff --git a/src/common/platform/onboard/host/device_runner_helpers.cpp b/src/common/platform/onboard/host/device_runner_helpers.cpp new file mode 100644 index 000000000..70d2c0ddd --- /dev/null +++ b/src/common/platform/onboard/host/device_runner_helpers.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Onboard host common helpers — `KernelArgsHelper` implementation. + * + * Linked into both a2a3 and a5 `libhost_runtime.so`. The arch-specific + * `KernelArgs` layout is brought in via `common/kernel_args.h` on the + * include path (each arch CMake adds the right one). 
+ */ + +#include "device_runner_helpers.h" + +#include + +#include "common/unified_log.h" + +int KernelArgsHelper::init_device_args(const DeviceArgs &host_device_args, MemoryAllocator &allocator) { + allocator_ = &allocator; + + // Allocate device memory for device_args + if (args.device_args == nullptr) { + uint64_t device_args_size = sizeof(DeviceArgs); + void *device_args_dev = allocator_->alloc(device_args_size); + if (device_args_dev == nullptr) { + LOG_ERROR("Alloc for device_args failed"); + return -1; + } + args.device_args = reinterpret_cast(device_args_dev); + } + // Copy host_device_args to device memory via device_args + int rc = + rtMemcpy(args.device_args, sizeof(DeviceArgs), &host_device_args, sizeof(DeviceArgs), RT_MEMCPY_HOST_TO_DEVICE); + if (rc != 0) { + LOG_ERROR("rtMemcpy failed: %d", rc); + allocator_->free(args.device_args); + args.device_args = nullptr; + return rc; + } + return 0; +} + +int KernelArgsHelper::finalize_device_args() { + if (args.device_args != nullptr && allocator_ != nullptr) { + int rc = allocator_->free(args.device_args); + args.device_args = nullptr; + return rc; + } + return 0; +} + +int KernelArgsHelper::init_runtime_args(const Runtime &host_runtime, MemoryAllocator &allocator) { + allocator_ = &allocator; + + if (args.runtime_args == nullptr) { + uint64_t runtime_size = sizeof(Runtime); + void *runtime_dev = allocator_->alloc(runtime_size); + if (runtime_dev == nullptr) { + LOG_ERROR("Alloc for runtime_args failed"); + return -1; + } + args.runtime_args = reinterpret_cast(runtime_dev); + } + int rc = rtMemcpy(args.runtime_args, sizeof(Runtime), &host_runtime, sizeof(Runtime), RT_MEMCPY_HOST_TO_DEVICE); + if (rc != 0) { + LOG_ERROR("rtMemcpy for runtime failed: %d", rc); + allocator_->free(args.runtime_args); + args.runtime_args = nullptr; + return rc; + } + return 0; +} + +int KernelArgsHelper::finalize_runtime_args() { + if (args.runtime_args != nullptr && allocator_ != nullptr) { + int rc = 
allocator_->free(args.runtime_args); + args.runtime_args = nullptr; + return rc; + } + return 0; +} + +int KernelArgsHelper::init_device_kernel_args(MemoryAllocator &allocator) { + allocator_ = &allocator; + if (device_k_args_ == nullptr) { + void *dev_ptr = allocator_->alloc(sizeof(KernelArgs)); + if (dev_ptr == nullptr) { + LOG_ERROR("Alloc for device KernelArgs failed"); + return -1; + } + device_k_args_ = reinterpret_cast(dev_ptr); + } + int rc = rtMemcpy(device_k_args_, sizeof(KernelArgs), &args, sizeof(KernelArgs), RT_MEMCPY_HOST_TO_DEVICE); + if (rc != 0) { + LOG_ERROR("rtMemcpy for KernelArgs failed: %d", rc); + allocator_->free(device_k_args_); + device_k_args_ = nullptr; + return rc; + } + return 0; +} + +int KernelArgsHelper::finalize_device_kernel_args() { + if (device_k_args_ != nullptr && allocator_ != nullptr) { + int rc = allocator_->free(device_k_args_); + device_k_args_ = nullptr; + return rc; + } + return 0; +} diff --git a/src/common/platform/onboard/host/device_runner_helpers.h b/src/common/platform/onboard/host/device_runner_helpers.h new file mode 100644 index 000000000..7630d65c7 --- /dev/null +++ b/src/common/platform/onboard/host/device_runner_helpers.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Onboard host common helpers — shared between a2a3 and a5 onboard host + * runtime libraries (`libhost_runtime.so`). + * + * Migration target for code that's line-identical between arches; arch-specific + * extensions (e.g. a2a3's `init_ffts_base_addr`) live as free functions in + * the arch's own `device_runner.h` rather than being declared here. + * + * Current contents: + * - `DeviceArgs`: per-task AICPU device-args struct (offsets fixed by + * libaicpu_extend_kernels' ABI; layout identical on both archs). + * - `KernelArgsHelper`: host-side `KernelArgs` wrapper with device-memory + * management for the 3 H2D copies (`DeviceArgs`, `Runtime`, `KernelArgs`). + * + * Future migrations (see `.docs/ONBOARD_HOST_COMMON_REFACTOR.md`): + * - `DeviceRunnerBase` (lifecycle + registration + profiling init). + * - C-API common shims. + */ + +#ifndef SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_HELPERS_H +#define SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_HELPERS_H + +#include + +#include "common/kernel_args.h" // arch-specific KernelArgs layout +#include "host/memory_allocator.h" +#include "runtime.h" + +/** + * DeviceArgs structure for AICPU device arguments. + * + * Layout offsets are still nominally fixed by libaicpu_extend_kernels.so for + * `aicpu_so_bin` / `aicpu_so_len` (at offsets 96 / 104), but per-task AICPU + * launches go through `rtsLaunchCpuKernel` against the cached `rtFuncHandle` + * on `LoadAicpuOp` — none of our code reads these fields. The fields are + * kept (zero-initialized, never assigned) so the H2D struct layout matches + * the historical contract on both archs; the runner-side `AicpuSoInfo` + * allocation that used to back them was removed in PR #877. 
+ */ +struct DeviceArgs { + uint64_t unused[12] = {0}; + uint64_t aicpu_so_bin{0}; + uint64_t aicpu_so_len{0}; +}; + +/** + * Helper class for managing `KernelArgs` with device memory. + * + * Wraps `KernelArgs` (defined per-arch in `common/kernel_args.h`) and provides + * host-side initialization methods for allocating device memory and copying + * data to the device. Separates device-memory management (host-only) from the + * structure layout (shared with kernels). + * + * The helper provides implicit conversion to `KernelArgs *` for seamless use + * with runtime APIs. + * + * Arch-specific extensions (a2a3-only `init_ffts_base_addr`, etc.) live as + * free functions in the arch's own `device_runner.h`. + */ +struct KernelArgsHelper { + KernelArgs args; + MemoryAllocator *allocator_{nullptr}; + KernelArgs *device_k_args_{nullptr}; // Device copy of KernelArgs for AICore + + /** + * Initialize device arguments by allocating device memory and copying data. + * + * @param host_device_args Host-side device arguments to copy. + * @param allocator Memory allocator to use. + * @return 0 on success, error code on failure. + */ + int init_device_args(const DeviceArgs &host_device_args, MemoryAllocator &allocator); + + /** Free device memory allocated for device arguments. */ + int finalize_device_args(); + + /** + * Initialize runtime arguments by allocating device memory and copying data. + * + * @param host_runtime Host-side runtime to copy to device. + * @param allocator Memory allocator to use. + * @return 0 on success, error code on failure. + */ + int init_runtime_args(const Runtime &host_runtime, MemoryAllocator &allocator); + + /** Free device memory allocated for runtime arguments. */ + int finalize_runtime_args(); + + /** + * Allocate device memory for the host-resident `KernelArgs` and copy the + * struct over. 
AICore's `KERNEL_ENTRY` expects a `KernelArgs *` (not a + * `Runtime *`) so it can read the profiling enablement bits + ring address + * tables and forward them into AICore platform state. Call this after + * every `kernel_args.args.*` field is populated for the run. + */ + int init_device_kernel_args(MemoryAllocator &allocator); + + /** Free device memory allocated for the device-resident `KernelArgs` copy. */ + int finalize_device_kernel_args(); + + /** + * Implicit conversion operators for seamless use with runtime APIs. + * + * These allow `KernelArgsHelper` to be used wherever `KernelArgs *` is + * expected, enabling transparent device memory management while + * maintaining API compatibility. + */ + operator KernelArgs *() { return &args; } + KernelArgs *operator&() { return &args; } +}; + +#endif // SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_HELPERS_H