From 72998141912a0b7caf6563d0e701468342367b23 Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Thu, 28 May 2026 14:30:51 +0800 Subject: [PATCH] Refactor: extract DeviceRunnerBase + tensor/arena methods to common MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR 2 of the multi-PR refactor described in `.docs/ONBOARD_HOST_COMMON_REFACTOR.md`. Introduces the `DeviceRunnerBase` class and moves the first set of line-identical methods + their load-bearing state from each arch's `DeviceRunner` into the shared base. Moves (both archs): | Symbol | From | To | | --- | --- | --- | | `mem_alloc_` (MemoryAllocator) | each arch's DeviceRunner | `DeviceRunnerBase` | | `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_` (DeviceArena × 3) | each arch's DeviceRunner | `DeviceRunnerBase` | | `arena_alloc_trampoline`, `arena_free_trampoline` (static fns) | each arch's DeviceRunner | `DeviceRunnerBase` | | `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device` | each arch's DeviceRunner | `DeviceRunnerBase` | | `acquire_pooled_gm_heap`, `acquire_pooled_gm_sm`, `acquire_pooled_runtime_arena` | each arch's DeviceRunner | `DeviceRunnerBase` | Each arch's `DeviceRunner` now `: public DeviceRunnerBase`. The arch's default constructor (`= default`) implicitly chains to the base ctor, which wires the three arenas to the inherited `mem_alloc_` via the inherited trampolines. Subclass keeps everything else (streams, kernel args, profiling collectors, callable registration, etc.) for now — those move in subsequent PRs (Groups C/D/E in the design doc). Non-virtual base destructor: the dlsym surface (`destroy_device_context`) always casts to the arch's concrete `DeviceRunner` before `delete`, so no polymorphic delete happens via a base pointer. Documented inline. ABI / Python surface unchanged. 
Local validation (a2a3 onboard, device 1, pinned PTO-ISA): - `vector_example`: PASS Co-Authored-By: Claude Opus 4.7 (1M context) --- src/a2a3/platform/onboard/host/CMakeLists.txt | 4 +- .../platform/onboard/host/device_runner.cpp | 33 ------- .../platform/onboard/host/device_runner.h | 89 +++-------------- src/a5/platform/onboard/host/CMakeLists.txt | 4 +- .../platform/onboard/host/device_runner.cpp | 33 ------- src/a5/platform/onboard/host/device_runner.h | 89 +++-------------- .../onboard/host/device_runner_base.cpp | 59 +++++++++++ .../onboard/host/device_runner_base.h | 99 +++++++++++++++++++ 8 files changed, 188 insertions(+), 222 deletions(-) create mode 100644 src/common/platform/onboard/host/device_runner_base.cpp create mode 100644 src/common/platform/onboard/host/device_runner_base.h diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index 16abd2e68..ac5f12b9f 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -68,10 +68,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" ) # Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard). -# Currently scaffolding only — real content moves here in follow-up PRs -# (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md). +# See .docs/ONBOARD_HOST_COMMON_REFACTOR.md for the migration plan. 
list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_base.cpp" ) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 74993a79b..f4039e17c 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -161,23 +161,6 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz return 0; } -void *DeviceRunner::acquire_pooled_gm_heap() { - if (!gm_heap_arena_.is_committed()) return nullptr; - return gm_heap_arena_.base(); -} - -void *DeviceRunner::acquire_pooled_gm_sm() { - if (!gm_sm_arena_.is_committed()) return nullptr; - return gm_sm_arena_.base(); -} - -void *DeviceRunner::acquire_pooled_runtime_arena() { - // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ - // uncommitted — fail loudly if a caller asks for it anyway. 
- if (!runtime_arena_pool_.is_committed()) return nullptr; - return runtime_arena_pool_.base(); -} - std::thread DeviceRunner::create_thread(std::function fn) { int dev_id = device_id_; return std::thread([dev_id, fn = std::move(fn)]() { @@ -406,22 +389,6 @@ int DeviceRunner::ensure_binaries_loaded() { return 0; } -void *DeviceRunner::allocate_tensor(size_t bytes) { return mem_alloc_.alloc(bytes); } - -void DeviceRunner::free_tensor(void *dev_ptr) { - if (dev_ptr != nullptr) { - mem_alloc_.free(dev_ptr); - } -} - -int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) { - return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE); -} - -int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes) { - return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST); -} - int DeviceRunner::query_max_block_dim(rtStream_t stream, uint32_t *out_cube, uint32_t *out_vector) { uint32_t cube_limit = 0, vector_limit = 0; bool got_limits = (aclrtGetStreamResLimit(stream, ACL_RT_DEV_RES_CUBE_CORE, &cube_limit) == ACL_ERROR_NONE) && diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index b7934686c..c35a5f65b 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -44,6 +44,7 @@ #include "common/platform_config.h" #include "common/unified_log.h" #include "device_arena.h" +#include "device_runner_base.h" // common DeviceRunnerBase #include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper #include "host/function_cache.h" #include "host/memory_allocator.h" @@ -77,12 +78,9 @@ int kernel_args_init_ffts_base_addr(KernelArgsHelper &helper); * - Coordinated execution of both kernel types * - Runtime execution workflow */ -class DeviceRunner { +class DeviceRunner : public DeviceRunnerBase { public: - DeviceRunner() : - gm_heap_arena_(&arena_alloc_trampoline, 
&arena_free_trampoline, &mem_alloc_), - gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), - runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + DeviceRunner() = default; ~DeviceRunner(); /** @@ -92,24 +90,12 @@ class DeviceRunner { * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no * prebuilt runtime arena) — the corresponding arena stays uncommitted. * Returns 0 on success, -1 on failure. - */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); - - /** - * Return the pooled GM heap / PTO2 SM / runtime arena pointer. - * setup_static_arena must have already committed the relevant region; - * otherwise these return nullptr. All pointers are stable for the - * Worker's lifetime; the three underlying device buffers are released - * in `finalize()`. * - * acquire_pooled_runtime_arena() is trb-only — the runtime arena region - * is only committed when setup_static_arena was called with - * runtime_arena_size > 0. Calling it on the hbg path - * (setup_static_arena(...,0)) returns nullptr (well-defined). + * `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`, + * and `acquire_pooled_{gm_heap,gm_sm,runtime_arena}` are inherited from + * `DeviceRunnerBase`. */ - void *acquire_pooled_gm_heap(); - void *acquire_pooled_gm_sm(); - void *acquire_pooled_runtime_arena(); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** * Create a thread bound to this device. 
@@ -117,41 +103,6 @@ class DeviceRunner { */ std::thread create_thread(std::function fn); - /** - * Allocate device tensor memory - * - * @param bytes Size of tensor in bytes - * @return Device pointer on success, nullptr on failure - */ - void *allocate_tensor(size_t bytes); - - /** - * Free device tensor memory - * - * @param dev_ptr Device pointer to free - */ - void free_tensor(void *dev_ptr); - - /** - * Copy data from host to device - * - * @param dev_ptr Device pointer - * @param host_ptr Host pointer - * @param bytes Number of bytes to copy - * @return 0 on success, error code on failure - */ - int copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes); - - /** - * Copy data from device to host - * - * @param host_ptr Host pointer - * @param dev_ptr Device pointer - * @param bytes Number of bytes to copy - * @return 0 on success, error code on failure - */ - int copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes); - /** * Execute a runtime * @@ -510,28 +461,14 @@ class DeviceRunner { // AICPU op loader — handles dispatcher bootstrap and per-task launches. host::LoadAicpuOp load_aicpu_op_; - // Memory management - MemoryAllocator mem_alloc_; - - // Three independent per-Worker arenas, each backing a single pooled - // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime - // arena). Split out from a single backing allocation because the - // combined size can exceed the device allocator's largest contiguous - // block — three separate device_malloc calls are friendlier than one - // big one. Released explicitly in finalize() before mem_alloc_.finalize() - // so the underlying buffers do not get freed twice. + // `mem_alloc_`, `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_`, + // and the alloc/free trampolines are inherited from `DeviceRunnerBase`. // - // `runtime_arena_pool_` stays unreserved when setup_static_arena was - // invoked with runtime_arena_size == 0 (hbg path). 
+ // Released explicitly in finalize() before mem_alloc_.finalize() so the + // underlying buffers do not get freed twice. `runtime_arena_pool_` stays + // unreserved when setup_static_arena was invoked with + // runtime_arena_size == 0 (hbg path). // - // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_. - static void *arena_alloc_trampoline(void *ctx, size_t size) { - return static_cast(ctx)->alloc(size); - } - static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena gm_heap_arena_; - DeviceArena gm_sm_arena_; - DeviceArena runtime_arena_pool_; // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating // the same buffer when a later worker init asks for an equal-or-smaller // layout on an already-committed arena. diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index eec7e4afd..b7099ff57 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -53,10 +53,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" ) # Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard). -# Currently scaffolding only — real content moves here in follow-up PRs -# (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md). +# See .docs/ONBOARD_HOST_COMMON_REFACTOR.md for the migration plan. 
list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_base.cpp" ) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 614ca180d..4e35dfdfb 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -110,23 +110,6 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz return 0; } -void *DeviceRunner::acquire_pooled_gm_heap() { - if (!gm_heap_arena_.is_committed()) return nullptr; - return gm_heap_arena_.base(); -} - -void *DeviceRunner::acquire_pooled_gm_sm() { - if (!gm_sm_arena_.is_committed()) return nullptr; - return gm_sm_arena_.base(); -} - -void *DeviceRunner::acquire_pooled_runtime_arena() { - // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ - // uncommitted — fail loudly if a caller asks for it anyway. 
- if (!runtime_arena_pool_.is_committed()) return nullptr; - return runtime_arena_pool_.base(); -} - std::thread DeviceRunner::create_thread(std::function fn) { int dev_id = device_id_; return std::thread([dev_id, fn = std::move(fn)]() { @@ -291,22 +274,6 @@ int DeviceRunner::ensure_binaries_loaded() { return 0; } -void *DeviceRunner::allocate_tensor(size_t bytes) { return mem_alloc_.alloc(bytes); } - -void DeviceRunner::free_tensor(void *dev_ptr) { - if (dev_ptr != nullptr) { - mem_alloc_.free(dev_ptr); - } -} - -int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) { - return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE); -} - -int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes) { - return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST); -} - int DeviceRunner::query_max_block_dim(rtStream_t stream, uint32_t *out_cube, uint32_t *out_vector) { uint32_t cube_limit = 0, vector_limit = 0; bool got_limits = (aclrtGetStreamResLimit(stream, ACL_RT_DEV_RES_CUBE_CORE, &cube_limit) == ACL_ERROR_NONE) && diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 2faa47d6c..f0e5062c4 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -39,6 +39,7 @@ #include "callable.h" #include "prepare_callable_common.h" #include "device_arena.h" +#include "device_runner_base.h" // common DeviceRunnerBase #include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper #include "common/kernel_args.h" #include "common/memory_barrier.h" @@ -69,12 +70,9 @@ * - Coordinated execution of both kernel types * - Runtime execution workflow */ -class DeviceRunner { +class DeviceRunner : public DeviceRunnerBase { public: - DeviceRunner() : - gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), - gm_sm_arena_(&arena_alloc_trampoline, 
&arena_free_trampoline, &mem_alloc_), - runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + DeviceRunner() = default; ~DeviceRunner(); /** @@ -84,24 +82,12 @@ class DeviceRunner { * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no * prebuilt runtime arena) — the corresponding arena stays uncommitted. * Returns 0 on success, -1 on failure. - */ - int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); - - /** - * Return the pooled GM heap / PTO2 SM / runtime arena pointer. - * setup_static_arena must have already committed the relevant region; - * otherwise these return nullptr. All pointers are stable for the - * Worker's lifetime; the three underlying device buffers are released - * in `finalize()`. * - * acquire_pooled_runtime_arena() is trb-only — the runtime arena region - * is only committed when setup_static_arena was called with - * runtime_arena_size > 0. Calling it on the hbg path - * (setup_static_arena(...,0)) returns nullptr (well-defined). + * `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`, + * and `acquire_pooled_{gm_heap,gm_sm,runtime_arena}` are inherited from + * `DeviceRunnerBase`. */ - void *acquire_pooled_gm_heap(); - void *acquire_pooled_gm_sm(); - void *acquire_pooled_runtime_arena(); + int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size); /** * Create a thread bound to this device. 
@@ -109,41 +95,6 @@ class DeviceRunner { */ std::thread create_thread(std::function fn); - /** - * Allocate device tensor memory - * - * @param bytes Size of tensor in bytes - * @return Device pointer on success, nullptr on failure - */ - void *allocate_tensor(size_t bytes); - - /** - * Free device tensor memory - * - * @param dev_ptr Device pointer to free - */ - void free_tensor(void *dev_ptr); - - /** - * Copy data from host to device - * - * @param dev_ptr Device pointer - * @param host_ptr Host pointer - * @param bytes Number of bytes to copy - * @return 0 on success, error code on failure - */ - int copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes); - - /** - * Copy data from device to host - * - * @param host_ptr Host pointer - * @param dev_ptr Device pointer - * @param bytes Number of bytes to copy - * @return 0 on success, error code on failure - */ - int copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes); - /** * Execute a runtime * @@ -423,28 +374,14 @@ class DeviceRunner { // AICPU op loader — handles dispatcher bootstrap and per-task launches. host::LoadAicpuOp load_aicpu_op_; - // Memory management - MemoryAllocator mem_alloc_; - - // Three independent per-Worker arenas, each backing a single pooled - // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime - // arena). Split out from a single backing allocation because the - // combined size can exceed the device allocator's largest contiguous - // block — three separate device_malloc calls are friendlier than one - // big one. Released explicitly in finalize() before mem_alloc_.finalize() - // so the underlying buffers do not get freed twice. + // `mem_alloc_`, `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_`, + // and the alloc/free trampolines are inherited from `DeviceRunnerBase`. // - // `runtime_arena_pool_` stays unreserved when setup_static_arena was - // invoked with runtime_arena_size == 0 (hbg path). 
+ // Released explicitly in finalize() before mem_alloc_.finalize() so the + // underlying buffers do not get freed twice. `runtime_arena_pool_` stays + // unreserved when setup_static_arena was invoked with + // runtime_arena_size == 0 (hbg path). // - // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_. - static void *arena_alloc_trampoline(void *ctx, size_t size) { - return static_cast(ctx)->alloc(size); - } - static void arena_free_trampoline(void *ctx, void *p) { static_cast(ctx)->free(p); } - DeviceArena gm_heap_arena_; - DeviceArena gm_sm_arena_; - DeviceArena runtime_arena_pool_; // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating // a buffer when a later worker init asks for an equal-or-smaller layout. size_t cached_gm_heap_size_{0}; diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp new file mode 100644 index 000000000..4a71ce07f --- /dev/null +++ b/src/common/platform/onboard/host/device_runner_base.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * `DeviceRunnerBase` — tensor-memory wrappers + pooled arena accessors. 
+ * + * Constructor wires the three arenas to call back into `mem_alloc_` via + * the static trampolines declared in the header. Per-region commit is + * still driven by the subclass's `setup_static_arena`. + */ + +#include "device_runner_base.h" + +#include + +DeviceRunnerBase::DeviceRunnerBase() : + gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), + runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} + +void *DeviceRunnerBase::allocate_tensor(std::size_t bytes) { return mem_alloc_.alloc(bytes); } + +void DeviceRunnerBase::free_tensor(void *dev_ptr) { + if (dev_ptr != nullptr) { + mem_alloc_.free(dev_ptr); + } +} + +int DeviceRunnerBase::copy_to_device(void *dev_ptr, const void *host_ptr, std::size_t bytes) { + return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE); +} + +int DeviceRunnerBase::copy_from_device(void *host_ptr, const void *dev_ptr, std::size_t bytes) { + return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST); +} + +void *DeviceRunnerBase::acquire_pooled_gm_heap() { + if (!gm_heap_arena_.is_committed()) return nullptr; + return gm_heap_arena_.base(); +} + +void *DeviceRunnerBase::acquire_pooled_gm_sm() { + if (!gm_sm_arena_.is_committed()) return nullptr; + return gm_sm_arena_.base(); +} + +void *DeviceRunnerBase::acquire_pooled_runtime_arena() { + // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_ + // uncommitted — return nullptr (no error raised) if a caller asks for it anyway. + if (!runtime_arena_pool_.is_committed()) return nullptr; + return runtime_arena_pool_.base(); +} diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h new file mode 100644 index 000000000..85b55ad51 --- /dev/null +++ b/src/common/platform/onboard/host/device_runner_base.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Onboard host `DeviceRunnerBase` — common base class for a2a3 and a5 + * onboard `DeviceRunner`s. + * + * This module owns the host-side state and methods that are identical + * between the two onboard arches today: + * - The `MemoryAllocator` and the three `DeviceArena`s (gm heap, PTO2 + * SM, runtime arena) backing the per-Worker pooled regions. + * - The trivial tensor-memory wrappers (`allocate_tensor`, + * `free_tensor`, `copy_*_device`). + * - The arena-pool accessors (`acquire_pooled_gm_heap`, etc.). + * + * Subclasses (`{a2a3,a5}::DeviceRunner`) add arch-specific state + * (streams, kernel args, profiling collectors, callable registration) + * and override behaviorally divergent methods (the kernel launch path, + * `finalize`). + * + * The migration plan in `.docs/ONBOARD_HOST_COMMON_REFACTOR.md` lays + * out the further extractions (lifecycle / registration / profiling + * init / c_api shims) that will progressively move methods + their + * load-bearing state from the arch subclass into this base. 
+ */ + +#ifndef SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H +#define SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H + +#include <cstddef> + +#include "device_arena.h" +#include "host/memory_allocator.h" + +/** + * Common base class for both a2a3 and a5 onboard `DeviceRunner`s. + * + * Ctor + dtor are `protected` so this class can only be used as a base; + * direct instantiation and `delete` through a base pointer are both + * compile errors. The arch subclass's `DeviceRunner` is what + * `destroy_device_context` sees, so the non-virtual `~DeviceRunnerBase` + * is safe — it is never invoked by `delete` through a base pointer. + */ +class DeviceRunnerBase { +public: + DeviceRunnerBase(const DeviceRunnerBase &) = delete; + DeviceRunnerBase &operator=(const DeviceRunnerBase &) = delete; + DeviceRunnerBase(DeviceRunnerBase &&) = delete; + DeviceRunnerBase &operator=(DeviceRunnerBase &&) = delete; + + /** Allocate / free / copy on the per-Worker `MemoryAllocator` + CANN runtime. */ + void *allocate_tensor(std::size_t bytes); + void free_tensor(void *dev_ptr); + int copy_to_device(void *dev_ptr, const void *host_ptr, std::size_t bytes); + int copy_from_device(void *host_ptr, const void *dev_ptr, std::size_t bytes); + + /** + * Return the pooled GM heap / PTO2 SM / runtime arena base pointer. + * `setup_static_arena` (arch subclass) must have already committed + * the relevant region; otherwise returns nullptr. The runtime arena + * accessor is trb-only — hbg's `setup_static_arena(...,0)` leaves + * `runtime_arena_pool_` uncommitted and this returns nullptr. + */ + void *acquire_pooled_gm_heap(); + void *acquire_pooled_gm_sm(); + void *acquire_pooled_runtime_arena(); + +protected: + // Ctor / dtor are protected: this class is for inheritance only — + // direct instantiation (`new DeviceRunnerBase()`) and polymorphic delete + // (`delete (DeviceRunnerBase *)p`) are both compile errors. 
+ DeviceRunnerBase(); + ~DeviceRunnerBase() = default; + + /** + * `DeviceArena` callback trampolines bridging from C-style + * `void *(void *ctx, size_t)` / `void (void *ctx, void *)` to the + * `MemoryAllocator` member function calls. The `ctx` opaque pointer + * passed at arena construction time is `&mem_alloc_`. + */ + static void *arena_alloc_trampoline(void *ctx, std::size_t size) { + return static_cast<MemoryAllocator *>(ctx)->alloc(size); + } + static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); } + + MemoryAllocator mem_alloc_; + DeviceArena gm_heap_arena_; + DeviceArena gm_sm_arena_; + DeviceArena runtime_arena_pool_; +}; + +#endif // SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H