Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/a2a3/platform/onboard/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp"
)
# Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard).
# Currently scaffolding only — real content moves here in follow-up PRs
# (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md).
# See .docs/ONBOARD_HOST_COMMON_REFACTOR.md for the migration plan.
list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_base.cpp"
)
if(DEFINED CUSTOM_SOURCE_DIRS)
foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS})
Expand Down
33 changes: 0 additions & 33 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,23 +161,6 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
return 0;
}

// Hand out the base pointer of the pooled GM heap arena.
// Returns nullptr while gm_heap_arena_ has not been committed, so callers
// must check the result before using it.
void *DeviceRunner::acquire_pooled_gm_heap() {
    if (gm_heap_arena_.is_committed()) {
        return gm_heap_arena_.base();
    }
    return nullptr;
}

// Hand out the base pointer of the pooled shared-memory arena.
// Returns nullptr while gm_sm_arena_ has not been committed, so callers
// must check the result before using it.
void *DeviceRunner::acquire_pooled_gm_sm() {
    if (gm_sm_arena_.is_committed()) {
        return gm_sm_arena_.base();
    }
    return nullptr;
}

// Hand out the base pointer of the pooled runtime arena.
// hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
// uncommitted; a caller that asks for the arena anyway receives nullptr
// (no error is raised here), so the result must be checked.
void *DeviceRunner::acquire_pooled_runtime_arena() {
    if (runtime_arena_pool_.is_committed()) {
        return runtime_arena_pool_.base();
    }
    return nullptr;
}

std::thread DeviceRunner::create_thread(std::function<void()> fn) {
int dev_id = device_id_;
return std::thread([dev_id, fn = std::move(fn)]() {
Expand Down Expand Up @@ -406,22 +389,6 @@ int DeviceRunner::ensure_binaries_loaded() {
return 0;
}

// Allocate `bytes` of device tensor memory through mem_alloc_.
// The allocator's result is returned to the caller unchanged.
void *DeviceRunner::allocate_tensor(size_t bytes) {
    void *tensor_mem = mem_alloc_.alloc(bytes);
    return tensor_mem;
}

// Release device tensor memory via mem_alloc_.
// A null `dev_ptr` is ignored and never reaches the allocator.
void DeviceRunner::free_tensor(void *dev_ptr) {
    if (dev_ptr == nullptr) {
        return;
    }
    mem_alloc_.free(dev_ptr);
}

// Copy `bytes` from host memory (`host_ptr`) into device memory (`dev_ptr`).
// `bytes` is passed to rtMemcpy for both of its size arguments, and the
// rtMemcpy status is returned to the caller as-is.
int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) {
    const auto status = rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE);
    return status;
}

// Copy `bytes` from device memory (`dev_ptr`) back into host memory (`host_ptr`).
// `bytes` is passed to rtMemcpy for both of its size arguments, and the
// rtMemcpy status is returned to the caller as-is.
int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes) {
    const auto status = rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST);
    return status;
}

int DeviceRunner::query_max_block_dim(rtStream_t stream, uint32_t *out_cube, uint32_t *out_vector) {
uint32_t cube_limit = 0, vector_limit = 0;
bool got_limits = (aclrtGetStreamResLimit(stream, ACL_RT_DEV_RES_CUBE_CORE, &cube_limit) == ACL_ERROR_NONE) &&
Expand Down
89 changes: 13 additions & 76 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "common/platform_config.h"
#include "common/unified_log.h"
#include "device_arena.h"
#include "device_runner_base.h" // common DeviceRunnerBase
#include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper
#include "host/function_cache.h"
#include "host/memory_allocator.h"
Expand Down Expand Up @@ -77,12 +78,9 @@ int kernel_args_init_ffts_base_addr(KernelArgsHelper &helper);
* - Coordinated execution of both kernel types
* - Runtime execution workflow
*/
class DeviceRunner {
class DeviceRunner : public DeviceRunnerBase {
public:
DeviceRunner() :
gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
DeviceRunner() = default;
~DeviceRunner();

/**
Expand All @@ -92,66 +90,19 @@ class DeviceRunner {
* on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
* prebuilt runtime arena) — the corresponding arena stays uncommitted.
* Returns 0 on success, -1 on failure.
*/
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);

/**
* Return the pooled GM heap / PTO2 SM / runtime arena pointer.
* setup_static_arena must have already committed the relevant region;
* otherwise these return nullptr. All pointers are stable for the
* Worker's lifetime; the three underlying device buffers are released
* in `finalize()`.
*
* acquire_pooled_runtime_arena() is trb-only — the runtime arena region
* is only committed when setup_static_arena was called with
* runtime_arena_size > 0. Calling it on the hbg path
* (setup_static_arena(...,0)) returns nullptr (well-defined).
* `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`,
* and `acquire_pooled_{gm_heap,gm_sm,runtime_arena}` are inherited from
* `DeviceRunnerBase`.
*/
void *acquire_pooled_gm_heap();
void *acquire_pooled_gm_sm();
void *acquire_pooled_runtime_arena();
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);

/**
* Create a thread bound to this device.
* The thread calls rtSetDevice(device_id) on entry.
*/
std::thread create_thread(std::function<void()> fn);

/**
* Allocate device tensor memory
*
* @param bytes Size of tensor in bytes
* @return Device pointer on success, nullptr on failure
*/
void *allocate_tensor(size_t bytes);

/**
* Free device tensor memory
*
* @param dev_ptr Device pointer to free
*/
void free_tensor(void *dev_ptr);

/**
* Copy data from host to device
*
* @param dev_ptr Device pointer
* @param host_ptr Host pointer
* @param bytes Number of bytes to copy
* @return 0 on success, error code on failure
*/
int copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes);

/**
* Copy data from device to host
*
* @param host_ptr Host pointer
* @param dev_ptr Device pointer
* @param bytes Number of bytes to copy
* @return 0 on success, error code on failure
*/
int copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes);

/**
* Execute a runtime
*
Expand Down Expand Up @@ -510,28 +461,14 @@ class DeviceRunner {
// AICPU op loader — handles dispatcher bootstrap and per-task launches.
host::LoadAicpuOp load_aicpu_op_;

// Memory management
MemoryAllocator mem_alloc_;

// Three independent per-Worker arenas, each backing a single pooled
// region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
// arena). Split out from a single backing allocation because the
// combined size can exceed the device allocator's largest contiguous
// block — three separate device_malloc calls are friendlier than one
// big one. Released explicitly in finalize() before mem_alloc_.finalize()
// so the underlying buffers do not get freed twice.
// `mem_alloc_`, `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_`,
// and the alloc/free trampolines are inherited from `DeviceRunnerBase`.
//
// `runtime_arena_pool_` stays unreserved when setup_static_arena was
// invoked with runtime_arena_size == 0 (hbg path).
// Released explicitly in finalize() before mem_alloc_.finalize() so the
// underlying buffers do not get freed twice. `runtime_arena_pool_` stays
// unreserved when setup_static_arena was invoked with
// runtime_arena_size == 0 (hbg path).
//
// Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
static void *arena_alloc_trampoline(void *ctx, size_t size) {
return static_cast<MemoryAllocator *>(ctx)->alloc(size);
}
static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
DeviceArena gm_heap_arena_;
DeviceArena gm_sm_arena_;
DeviceArena runtime_arena_pool_;
// Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
// the same buffer when a later worker init asks for an equal-or-smaller
// layout on an already-committed arena.
Expand Down
4 changes: 2 additions & 2 deletions src/a5/platform/onboard/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp"
)
# Add common/platform/onboard/host sources (shared between a2a3 / a5 onboard).
# Currently scaffolding only — real content moves here in follow-up PRs
# (see .docs/ONBOARD_HOST_COMMON_REFACTOR.md).
# See .docs/ONBOARD_HOST_COMMON_REFACTOR.md for the migration plan.
list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_helpers.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/platform/onboard/host/device_runner_base.cpp"
)
if(DEFINED CUSTOM_SOURCE_DIRS)
foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS})
Expand Down
33 changes: 0 additions & 33 deletions src/a5/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,23 +110,6 @@ int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, siz
return 0;
}

// Hand out the base pointer of the pooled GM heap arena.
// Returns nullptr while gm_heap_arena_ has not been committed, so callers
// must check the result before using it.
void *DeviceRunner::acquire_pooled_gm_heap() {
    if (gm_heap_arena_.is_committed()) {
        return gm_heap_arena_.base();
    }
    return nullptr;
}

// Hand out the base pointer of the pooled shared-memory arena.
// Returns nullptr while gm_sm_arena_ has not been committed, so callers
// must check the result before using it.
void *DeviceRunner::acquire_pooled_gm_sm() {
    if (gm_sm_arena_.is_committed()) {
        return gm_sm_arena_.base();
    }
    return nullptr;
}

// Hand out the base pointer of the pooled runtime arena.
// hbg calls setup_static_arena(...,0) and leaves runtime_arena_pool_
// uncommitted; a caller that asks for the arena anyway receives nullptr
// (no error is raised here), so the result must be checked.
void *DeviceRunner::acquire_pooled_runtime_arena() {
    if (runtime_arena_pool_.is_committed()) {
        return runtime_arena_pool_.base();
    }
    return nullptr;
}

std::thread DeviceRunner::create_thread(std::function<void()> fn) {
int dev_id = device_id_;
return std::thread([dev_id, fn = std::move(fn)]() {
Expand Down Expand Up @@ -291,22 +274,6 @@ int DeviceRunner::ensure_binaries_loaded() {
return 0;
}

// Allocate `bytes` of device tensor memory through mem_alloc_.
// The allocator's result is returned to the caller unchanged.
void *DeviceRunner::allocate_tensor(size_t bytes) {
    void *tensor_mem = mem_alloc_.alloc(bytes);
    return tensor_mem;
}

// Release device tensor memory via mem_alloc_.
// A null `dev_ptr` is ignored and never reaches the allocator.
void DeviceRunner::free_tensor(void *dev_ptr) {
    if (dev_ptr == nullptr) {
        return;
    }
    mem_alloc_.free(dev_ptr);
}

// Copy `bytes` from host memory (`host_ptr`) into device memory (`dev_ptr`).
// `bytes` is passed to rtMemcpy for both of its size arguments, and the
// rtMemcpy status is returned to the caller as-is.
int DeviceRunner::copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes) {
    const auto status = rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE);
    return status;
}

// Copy `bytes` from device memory (`dev_ptr`) back into host memory (`host_ptr`).
// `bytes` is passed to rtMemcpy for both of its size arguments, and the
// rtMemcpy status is returned to the caller as-is.
int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes) {
    const auto status = rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST);
    return status;
}

int DeviceRunner::query_max_block_dim(rtStream_t stream, uint32_t *out_cube, uint32_t *out_vector) {
uint32_t cube_limit = 0, vector_limit = 0;
bool got_limits = (aclrtGetStreamResLimit(stream, ACL_RT_DEV_RES_CUBE_CORE, &cube_limit) == ACL_ERROR_NONE) &&
Expand Down
89 changes: 13 additions & 76 deletions src/a5/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "callable.h"
#include "prepare_callable_common.h"
#include "device_arena.h"
#include "device_runner_base.h" // common DeviceRunnerBase
#include "device_runner_helpers.h" // common DeviceArgs + KernelArgsHelper
#include "common/kernel_args.h"
#include "common/memory_barrier.h"
Expand Down Expand Up @@ -69,12 +70,9 @@
* - Coordinated execution of both kernel types
* - Runtime execution workflow
*/
class DeviceRunner {
class DeviceRunner : public DeviceRunnerBase {
public:
DeviceRunner() :
gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_),
runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {}
DeviceRunner() = default;
~DeviceRunner();

/**
Expand All @@ -84,66 +82,19 @@ class DeviceRunner {
* on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
* prebuilt runtime arena) — the corresponding arena stays uncommitted.
* Returns 0 on success, -1 on failure.
*/
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);

/**
* Return the pooled GM heap / PTO2 SM / runtime arena pointer.
* setup_static_arena must have already committed the relevant region;
* otherwise these return nullptr. All pointers are stable for the
* Worker's lifetime; the three underlying device buffers are released
* in `finalize()`.
*
* acquire_pooled_runtime_arena() is trb-only — the runtime arena region
* is only committed when setup_static_arena was called with
* runtime_arena_size > 0. Calling it on the hbg path
* (setup_static_arena(...,0)) returns nullptr (well-defined).
* `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`,
* and `acquire_pooled_{gm_heap,gm_sm,runtime_arena}` are inherited from
* `DeviceRunnerBase`.
*/
void *acquire_pooled_gm_heap();
void *acquire_pooled_gm_sm();
void *acquire_pooled_runtime_arena();
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);

/**
* Create a thread bound to this device.
* The thread calls rtSetDevice(device_id) on entry.
*/
std::thread create_thread(std::function<void()> fn);

/**
* Allocate device tensor memory
*
* @param bytes Size of tensor in bytes
* @return Device pointer on success, nullptr on failure
*/
void *allocate_tensor(size_t bytes);

/**
* Free device tensor memory
*
* @param dev_ptr Device pointer to free
*/
void free_tensor(void *dev_ptr);

/**
* Copy data from host to device
*
* @param dev_ptr Device pointer
* @param host_ptr Host pointer
* @param bytes Number of bytes to copy
* @return 0 on success, error code on failure
*/
int copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes);

/**
* Copy data from device to host
*
* @param host_ptr Host pointer
* @param dev_ptr Device pointer
* @param bytes Number of bytes to copy
* @return 0 on success, error code on failure
*/
int copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes);

/**
* Execute a runtime
*
Expand Down Expand Up @@ -423,28 +374,14 @@ class DeviceRunner {
// AICPU op loader — handles dispatcher bootstrap and per-task launches.
host::LoadAicpuOp load_aicpu_op_;

// Memory management
MemoryAllocator mem_alloc_;

// Three independent per-Worker arenas, each backing a single pooled
// region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime
// arena). Split out from a single backing allocation because the
// combined size can exceed the device allocator's largest contiguous
// block — three separate device_malloc calls are friendlier than one
// big one. Released explicitly in finalize() before mem_alloc_.finalize()
// so the underlying buffers do not get freed twice.
// `mem_alloc_`, `gm_heap_arena_`, `gm_sm_arena_`, `runtime_arena_pool_`,
// and the alloc/free trampolines are inherited from `DeviceRunnerBase`.
//
// `runtime_arena_pool_` stays unreserved when setup_static_arena was
// invoked with runtime_arena_size == 0 (hbg path).
// Released explicitly in finalize() before mem_alloc_.finalize() so the
// underlying buffers do not get freed twice. `runtime_arena_pool_` stays
// unreserved when setup_static_arena was invoked with
// runtime_arena_size == 0 (hbg path).
//
// Trampolines forward DeviceArena's alloc/free calls to mem_alloc_.
static void *arena_alloc_trampoline(void *ctx, size_t size) {
return static_cast<MemoryAllocator *>(ctx)->alloc(size);
}
static void arena_free_trampoline(void *ctx, void *p) { static_cast<MemoryAllocator *>(ctx)->free(p); }
DeviceArena gm_heap_arena_;
DeviceArena gm_sm_arena_;
DeviceArena runtime_arena_pool_;
// Cached sizes for setup_static_arena's "fits" check — avoids re-allocating
// a buffer when a later worker init asks for an equal-or-smaller layout.
size_t cached_gm_heap_size_{0};
Expand Down
Loading
Loading