From 72998141912a0b7caf6563d0e701468342367b23 Mon Sep 17 00:00:00 2001
From: Chao Wang <26245345+ChaoWao@users.noreply.github.com>
Date: Thu, 28 May 2026 14:30:51 +0800
Subject: [PATCH] Refactor: extract DeviceRunnerBase + tensor/arena methods to
 common
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR 2 of the multi-PR refactor described in
`.docs/ONBOARD_HOST_COMMON_REFACTOR.md`. Introduces the
`DeviceRunnerBase` class and moves the first set of line-identical
methods + their load-bearing state from each arch's `DeviceRunner`
into the shared base.

Moves (both archs):

| Symbol | From | To |
| --- | --- | --- |
| `mem_alloc_` (MemoryAllocator) | each arch's DeviceRunner | `DeviceRunnerBase` |
| `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_` (DeviceArena × 3) | each arch's DeviceRunner | `DeviceRunnerBase` |
| `arena_alloc_trampoline`, `arena_free_trampoline` (static fns) | each arch's DeviceRunner | `DeviceRunnerBase` |
| `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device` | each arch's DeviceRunner | `DeviceRunnerBase` |
| `acquire_pooled_gm_heap`, `acquire_pooled_gm_sm`, `acquire_pooled_runtime_arena` | each arch's DeviceRunner | `DeviceRunnerBase` |

Each arch's `DeviceRunner` now `: public DeviceRunnerBase`. The arch's
default constructor (`= default`) implicitly chains to the base ctor,
which wires the three arenas to the inherited `mem_alloc_` via the
inherited trampolines. Subclass keeps everything else (streams,
kernel args, profiling collectors, callable registration, etc.) for
now — those move in subsequent PRs (Groups C/D/E in the design doc).

Non-virtual, `protected` base destructor: the dlsym surface
(`destroy_device_context`) always casts to the arch's concrete
`DeviceRunner` before `delete`, so no polymorphic delete happens via
a base pointer (and the `protected` access makes any such delete a
compile error). Documented inline.

ABI / Python surface unchanged.

Local validation (a2a3 onboard, device 1, pinned PTO-ISA):
- `vector_example`: PASS

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/a2a3/platform/onboard/host/CMakeLists.txt |  4 +-
 .../platform/onboard/host/device_runner.cpp   | 33 -------
 .../platform/onboard/host/device_runner.h     | 89 +++--------------
 src/a5/platform/onboard/host/CMakeLists.txt   |  4 +-
 .../platform/onboard/host/device_runner.cpp   | 33 -------
 src/a5/platform/onboard/host/device_runner.h  | 89 +++--------------
 .../onboard/host/device_runner_base.cpp       | 59 +++++++++++
 .../onboard/host/device_runner_base.h         | 99 +++++++++++++++++++
 8 files changed, 188 insertions(+), 222 deletions(-)
 create mode 100644 src/common/platform/onboard/host/device_runner_base.cpp
 create mode 100644 src/common/platform/onboard/host/device_runner_base.h

diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt
index 16abd2e68..ac5f12b9f 100644
--- a/src/a2a3/platform/onboard/host/CMakeLists.txt
+++ b/src/a2a3/platform/onboard/host/CMakeLists.txt
@@ -68,10 +68,10 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp"
 )
 # Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard).
-# Currently scaffolding only — real content moves here in follow-up PRs
-# (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md).
+# See .docs/ONBOARD_HOST_COMMON_REFACTOR.md for the migration plan.
 list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_base.cpp"
 )
 if(DEFINED CUSTOM_SOURCE_DIRS)
     foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS})
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index 74993a79b..f4039e17c 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -161,23 +161,6 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
     return 0;
 }
 
-void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!gm_heap_arena_.is_committed()) return nullptr;
-    return gm_heap_arena_.base();
-}
-
-void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!gm_sm_arena_.is_committed()) return nullptr;
-    return gm_sm_arena_.base();
-}
-
-void *DeviceRunner::acquire_pooled_runtime_arena() {
-    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
-    // uncommitted — fail loudly if a caller asks for it anyway.
-    if (!runtime_arena_pool_.is_committed()) return nullptr;
-    return runtime_arena_pool_.base();
-}
-
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
     int dev_id = device_id_;
     return std::thread([dev_id, fn = std::move(fn)]() {
@@ -406,22 +389,6 @@ int DeviceRunner::ensure_binaries_loaded() {
     return 0;
 }
 
-void *DeviceRunner::allocate_tensor(size_t bytes) { return mem_alloc_.alloc(bytes); }
-
-void DeviceRunner::free_tensor(void *dev_ptr) {
-    if (dev_ptr != nullptr) {
-        mem_alloc_.free(dev_ptr);
-    }
-}
-
-int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) {
-    return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE);
-}
-
-int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes) {
-    return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST);
-}
-
 int DeviceRunner::query_max_block_dim(rtStream_t stream, uint32_t *out_cube, uint32_t *out_vector) {
     uint32_t cube_limit = 0, vector_limit = 0;
     bool got_limits = (aclrtGetStreamResLimit(stream, ACL_RT_DEV_RES_CUBE_CORE, &cube_limit) == ACL_ERROR_NONE) &&
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index b7934686c..c35a5f65b 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -44,6 +44,7 @@
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 #include "device_arena.h"
+#include "device_runner_base.h"     // common DeviceRunnerBase
 #include "device_runner_helpers.h"  // common DeviceArgs + KernelArgsHelper
 #include "host/function_cache.h"
 #include "host/memory_allocator.h"
@@ -77,12 +78,9 @@ int kernel_args_init_ffts_base_addr(KernelArgsHelper &helper);
  * - Coordinated execution of both kernel types
  * - Runtime execution workflow
  */
-class DeviceRunner {
+class DeviceRunner : public DeviceRunnerBase {
 public:
-    DeviceRunner() :
-        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
-        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
-        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+    DeviceRunner() = default;
     ~DeviceRunner();
 
     /**
@@ -92,24 +90,12 @@ class DeviceRunner {
      * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
      * prebuilt runtime arena) — the corresponding arena stays uncommitted.
      * Returns 0 on success, -1 on failure.
-     */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
-
-    /**
-     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
-     * setup_static_arena must have already committed the relevant region;
-     * otherwise these return nullptr. All pointers are stable for the
-     * Worker's lifetime; the three underlying device buffers are released
-     * in `finalize()`.
      *
-     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
-     * is only committed when setup_static_arena was called with
-     * runtime_arena_size > 0. Calling it on the hbg path
-     * (setup_static_arena(...,0)) returns nullptr (well-defined).
+     * `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`,
+     * and `acquire_pooled_{gm_heap,gm_sm,runtime_arena}` are inherited from
+     * `DeviceRunnerBase`.
      */
-    void *acquire_pooled_gm_heap();
-    void *acquire_pooled_gm_sm();
-    void *acquire_pooled_runtime_arena();
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
      * Create a thread bound to this device.
@@ -117,41 +103,6 @@ class DeviceRunner {
      */
     std::thread create_thread(std::function<void()> fn);
 
-    /**
-     * Allocate device tensor memory
-     *
-     * @param bytes  Size of tensor in bytes
-     * @return Device pointer on success, nullptr on failure
-     */
-    void *allocate_tensor(size_t bytes);
-
-    /**
-     * Free device tensor memory
-     *
-     * @param dev_ptr  Device pointer to free
-     */
-    void free_tensor(void *dev_ptr);
-
-    /**
-     * Copy data from host to device
-     *
-     * @param dev_ptr   Device pointer
-     * @param host_ptr  Host pointer
-     * @param bytes    Number of bytes to copy
-     * @return 0 on success, error code on failure
-     */
-    int copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes);
-
-    /**
-     * Copy data from device to host
-     *
-     * @param host_ptr  Host pointer
-     * @param dev_ptr   Device pointer
-     * @param bytes    Number of bytes to copy
-     * @return 0 on success, error code on failure
-     */
-    int copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes);
-
     /**
      * Execute a runtime
      *
@@ -510,28 +461,14 @@ class DeviceRunner {
     // AICPU op loader — handles dispatcher bootstrap and per-task launches.
     host::LoadAicpuOp load_aicpu_op_;
 
-    // Memory management
-    MemoryAllocator mem_alloc_;
-
-    // Three independent per-Worker arenas, each backing a single pooled
-    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
-    // arena). Split out from a single backing allocation because the
-    // combined size can exceed the device allocator's largest contiguous
-    // block — three separate device_malloc calls are friendlier than one
-    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
-    // so the underlying buffers do not get freed twice.
+    // `mem_alloc_`, `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_`,
+    // and the alloc/free trampolines are inherited from `DeviceRunnerBase`.
     //
-    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
-    // invoked with runtime_arena_size == 0 (hbg path).
+    // The three arenas are released explicitly in finalize() before
+    // mem_alloc_.finalize() so the underlying buffers do not get freed
+    // twice. `runtime_arena_pool_` stays unreserved when setup_static_arena
+    // was invoked with runtime_arena_size == 0 (hbg path).
     //
-    // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
-    static void *arena_alloc_trampoline(void *ctx, size_t size) {
-        return static_cast<MemoryAllocator *>(ctx)->alloc(size);
-    }
-    static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena gm_heap_arena_;
-    DeviceArena gm_sm_arena_;
-    DeviceArena runtime_arena_pool_;
     // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
     // the same buffer when a later worker init asks for an equal-or-smaller
     // layout on an already-committed arena.
diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt
index eec7e4afd..b7099ff57 100644
--- a/src/a5/platform/onboard/host/CMakeLists.txt
+++ b/src/a5/platform/onboard/host/CMakeLists.txt
@@ -53,10 +53,10 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp"
 )
 # Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard).
-# Currently scaffolding only — real content moves here in follow-up PRs
-# (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md).
+# See .docs/ONBOARD_HOST_COMMON_REFACTOR.md for the migration plan.
 list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_base.cpp"
 )
 if(DEFINED CUSTOM_SOURCE_DIRS)
     foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS})
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 614ca180d..4e35dfdfb 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -110,23 +110,6 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
     return 0;
 }
 
-void *DeviceRunner::acquire_pooled_gm_heap() {
-    if (!gm_heap_arena_.is_committed()) return nullptr;
-    return gm_heap_arena_.base();
-}
-
-void *DeviceRunner::acquire_pooled_gm_sm() {
-    if (!gm_sm_arena_.is_committed()) return nullptr;
-    return gm_sm_arena_.base();
-}
-
-void *DeviceRunner::acquire_pooled_runtime_arena() {
-    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
-    // uncommitted — fail loudly if a caller asks for it anyway.
-    if (!runtime_arena_pool_.is_committed()) return nullptr;
-    return runtime_arena_pool_.base();
-}
-
 std::thread DeviceRunner::create_thread(std::function<void()> fn) {
     int dev_id = device_id_;
     return std::thread([dev_id, fn = std::move(fn)]() {
@@ -291,22 +274,6 @@ int DeviceRunner::ensure_binaries_loaded() {
     return 0;
 }
 
-void *DeviceRunner::allocate_tensor(size_t bytes) { return mem_alloc_.alloc(bytes); }
-
-void DeviceRunner::free_tensor(void *dev_ptr) {
-    if (dev_ptr != nullptr) {
-        mem_alloc_.free(dev_ptr);
-    }
-}
-
-int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) {
-    return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE);
-}
-
-int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes) {
-    return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST);
-}
-
 int DeviceRunner::query_max_block_dim(rtStream_t stream, uint32_t *out_cube, uint32_t *out_vector) {
     uint32_t cube_limit = 0, vector_limit = 0;
     bool got_limits = (aclrtGetStreamResLimit(stream, ACL_RT_DEV_RES_CUBE_CORE, &cube_limit) == ACL_ERROR_NONE) &&
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index 2faa47d6c..f0e5062c4 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -39,6 +39,7 @@
 #include "callable.h"
 #include "prepare_callable_common.h"
 #include "device_arena.h"
+#include "device_runner_base.h"     // common DeviceRunnerBase
 #include "device_runner_helpers.h"  // common DeviceArgs + KernelArgsHelper
 #include "common/kernel_args.h"
 #include "common/memory_barrier.h"
@@ -69,12 +70,9 @@
  * - Coordinated execution of both kernel types
  * - Runtime execution workflow
  */
-class DeviceRunner {
+class DeviceRunner : public DeviceRunnerBase {
 public:
-    DeviceRunner() :
-        gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
-        gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
-        runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+    DeviceRunner() = default;
     ~DeviceRunner();
 
     /**
@@ -84,24 +82,12 @@ class DeviceRunner {
      * on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
      * prebuilt runtime arena) — the corresponding arena stays uncommitted.
      * Returns 0 on success, -1 on failure.
-     */
-    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
-
-    /**
-     * Return the pooled GM heap / PTO2 SM / runtime arena pointer.
-     * setup_static_arena must have already committed the relevant region;
-     * otherwise these return nullptr. All pointers are stable for the
-     * Worker's lifetime; the three underlying device buffers are released
-     * in `finalize()`.
      *
-     * acquire_pooled_runtime_arena() is trb-only — the runtime arena region
-     * is only committed when setup_static_arena was called with
-     * runtime_arena_size > 0. Calling it on the hbg path
-     * (setup_static_arena(...,0)) returns nullptr (well-defined).
+     * `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`,
+     * and `acquire_pooled_{gm_heap,gm_sm,runtime_arena}` are inherited from
+     * `DeviceRunnerBase`.
      */
-    void *acquire_pooled_gm_heap();
-    void *acquire_pooled_gm_sm();
-    void *acquire_pooled_runtime_arena();
+    int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
 
     /**
      * Create a thread bound to this device.
@@ -109,41 +95,6 @@ class DeviceRunner {
      */
     std::thread create_thread(std::function<void()> fn);
 
-    /**
-     * Allocate device tensor memory
-     *
-     * @param bytes  Size of tensor in bytes
-     * @return Device pointer on success, nullptr on failure
-     */
-    void *allocate_tensor(size_t bytes);
-
-    /**
-     * Free device tensor memory
-     *
-     * @param dev_ptr  Device pointer to free
-     */
-    void free_tensor(void *dev_ptr);
-
-    /**
-     * Copy data from host to device
-     *
-     * @param dev_ptr   Device pointer
-     * @param host_ptr  Host pointer
-     * @param bytes    Number of bytes to copy
-     * @return 0 on success, error code on failure
-     */
-    int copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes);
-
-    /**
-     * Copy data from device to host
-     *
-     * @param host_ptr  Host pointer
-     * @param dev_ptr   Device pointer
-     * @param bytes    Number of bytes to copy
-     * @return 0 on success, error code on failure
-     */
-    int copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes);
-
     /**
      * Execute a runtime
      *
@@ -423,28 +374,14 @@ class DeviceRunner {
     // AICPU op loader — handles dispatcher bootstrap and per-task launches.
     host::LoadAicpuOp load_aicpu_op_;
 
-    // Memory management
-    MemoryAllocator mem_alloc_;
-
-    // Three independent per-Worker arenas, each backing a single pooled
-    // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
-    // arena). Split out from a single backing allocation because the
-    // combined size can exceed the device allocator's largest contiguous
-    // block — three separate device_malloc calls are friendlier than one
-    // big one. Released explicitly in finalize() before mem_alloc_.finalize()
-    // so the underlying buffers do not get freed twice.
+    // `mem_alloc_`, `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_`,
+    // and the alloc/free trampolines are inherited from `DeviceRunnerBase`.
     //
-    // `runtime_arena_pool_` stays unreserved when setup_static_arena was
-    // invoked with runtime_arena_size == 0 (hbg path).
+    // The three arenas are released explicitly in finalize() before
+    // mem_alloc_.finalize() so the underlying buffers do not get freed
+    // twice. `runtime_arena_pool_` stays unreserved when setup_static_arena
+    // was invoked with runtime_arena_size == 0 (hbg path).
     //
-    // Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
-    static void *arena_alloc_trampoline(void *ctx, size_t size) {
-        return static_cast<MemoryAllocator *>(ctx)->alloc(size);
-    }
-    static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
-    DeviceArena gm_heap_arena_;
-    DeviceArena gm_sm_arena_;
-    DeviceArena runtime_arena_pool_;
     // Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
     // a buffer when a later worker init asks for an equal-or-smaller layout.
     size_t cached_gm_heap_size_{0};
diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp
new file mode 100644
index 000000000..4a71ce07f
--- /dev/null
+++ b/src/common/platform/onboard/host/device_runner_base.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * `DeviceRunnerBase` — tensor-memory wrappers + pooled arena accessors.
+ *
+ * Constructor wires the three arenas to call back into `mem_alloc_` via
+ * the static trampolines declared in the header. Per-region commit is
+ * still driven by the subclass's `setup_static_arena`.
+ */
+
+#include "device_runner_base.h"
+
+#include <runtime/rt.h>
+
+DeviceRunnerBase::DeviceRunnerBase() :
+    gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+    gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
+    runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
+
+void *DeviceRunnerBase::allocate_tensor(std::size_t bytes) { return mem_alloc_.alloc(bytes); }
+
+void DeviceRunnerBase::free_tensor(void *dev_ptr) {
+    if (dev_ptr != nullptr) {
+        mem_alloc_.free(dev_ptr);
+    }
+}
+
+int DeviceRunnerBase::copy_to_device(void *dev_ptr, const void *host_ptr, std::size_t bytes) {
+    return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE);
+}
+
+int DeviceRunnerBase::copy_from_device(void *host_ptr, const void *dev_ptr, std::size_t bytes) {
+    return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST);
+}
+
+void *DeviceRunnerBase::acquire_pooled_gm_heap() {
+    if (!gm_heap_arena_.is_committed()) return nullptr;
+    return gm_heap_arena_.base();
+}
+
+void *DeviceRunnerBase::acquire_pooled_gm_sm() {
+    if (!gm_sm_arena_.is_committed()) return nullptr;
+    return gm_sm_arena_.base();
+}
+
+void *DeviceRunnerBase::acquire_pooled_runtime_arena() {
+    // hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
+    // uncommitted — a caller that asks for it anyway gets nullptr, not an error.
+    if (!runtime_arena_pool_.is_committed()) return nullptr;
+    return runtime_arena_pool_.base();
+}
diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h
new file mode 100644
index 000000000..85b55ad51
--- /dev/null
+++ b/src/common/platform/onboard/host/device_runner_base.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Onboard host `DeviceRunnerBase` — common base class for a2a3 and a5
+ * onboard `DeviceRunner`s.
+ *
+ * This module owns the host-side state and methods that are identical
+ * between the two onboard arches today:
+ *   - The `MemoryAllocator` and the three `DeviceArena`s (gm heap, PTO2
+ *     SM, runtime arena) backing the per-Worker pooled regions.
+ *   - The trivial tensor-memory wrappers (`allocate_tensor`,
+ *     `free_tensor`, `copy_*_device`).
+ *   - The arena-pool accessors (`acquire_pooled_gm_heap`, etc.).
+ *
+ * Subclasses (`{a2a3,a5}::DeviceRunner`) add arch-specific state
+ * (streams, kernel args, profiling collectors, callable registration)
+ * and implement the behaviorally divergent methods (the kernel launch
+ * path, `finalize`) themselves; nothing in this base is virtual.
+ *
+ * The migration plan in `.docs/ONBOARD_HOST_COMMON_REFACTOR.md` lays
+ * out the further extractions (lifecycle / registration / profiling
+ * init / c_api shims) that will progressively move methods + their
+ * load-bearing state from the arch subclass into this base.
+ */
+
+#ifndef SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H
+#define SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H
+
+#include <cstddef>
+
+#include "device_arena.h"
+#include "host/memory_allocator.h"
+
+/**
+ * Common base class for both a2a3 and a5 onboard `DeviceRunner`s.
+ *
+ * Ctor + dtor are `protected` so this class can only be used as a base;
+ * direct instantiation and `delete` through a base pointer are both
+ * compile errors. `destroy_device_context` deletes the arch's concrete
+ * `DeviceRunner`, so the non-virtual `~DeviceRunnerBase` is safe — it
+ * only ever runs as the base-subobject step of the derived destructor.
+ */
+class DeviceRunnerBase {
+public:
+    DeviceRunnerBase(const DeviceRunnerBase &) = delete;
+    DeviceRunnerBase &operator=(const DeviceRunnerBase &) = delete;
+    DeviceRunnerBase(DeviceRunnerBase &&) = delete;
+    DeviceRunnerBase &operator=(DeviceRunnerBase &&) = delete;
+
+    /** Allocate / free / copy on the per-Worker `MemoryAllocator` + CANN runtime. */
+    void *allocate_tensor(std::size_t bytes);
+    void free_tensor(void *dev_ptr);
+    int copy_to_device(void *dev_ptr, const void *host_ptr, std::size_t bytes);
+    int copy_from_device(void *host_ptr, const void *dev_ptr, std::size_t bytes);
+
+    /**
+     * Return the pooled GM heap / PTO2 SM / runtime arena base pointer.
+     * `setup_static_arena` (arch subclass) must have already committed
+     * the relevant region; otherwise returns nullptr. The runtime arena
+     * accessor is trb-only — hbg's `setup_static_arena(...,0)` leaves
+     * `runtime_arena_pool_` uncommitted and this returns nullptr.
+     */
+    void *acquire_pooled_gm_heap();
+    void *acquire_pooled_gm_sm();
+    void *acquire_pooled_runtime_arena();
+
+protected:
+    // Ctor / dtor are protected: this class is for inheritance only —
+    // direct instantiation (`new DeviceRunnerBase()`) and polymorphic delete
+    // (`delete (DeviceRunnerBase *)p`) are both compile errors.
+    DeviceRunnerBase();
+    ~DeviceRunnerBase() = default;
+
+    /**
+     * `DeviceArena` callback trampolines bridging from C-style
+     * `void *(void *ctx, size_t)` / `void (void *ctx, void *)` to the
+     * `MemoryAllocator` member function calls. The `ctx` opaque pointer
+     * passed at arena construction time is `&mem_alloc_`.
+     */
+    static void *arena_alloc_trampoline(void *ctx, std::size_t size) {
+        return static_cast<MemoryAllocator *>(ctx)->alloc(size);
+    }
+    static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
+
+    MemoryAllocator mem_alloc_;
+    DeviceArena gm_heap_arena_;
+    DeviceArena gm_sm_arena_;
+    DeviceArena runtime_arena_pool_;
+};
+
+#endif  // SIMPLER_COMMON_PLATFORM_ONBOARD_HOST_DEVICE_RUNNER_BASE_H