Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions examples/scripts/code_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,14 +311,23 @@ def _ensure_pto_isa_root(verbose: bool = False, commit: Optional[str] = None) ->
if verbose:
logger.info("PTO_ISA_ROOT not set, cloning pto-isa repository...")
if not _clone_pto_isa(verbose=verbose, commit=commit):
# Another parallel process may have completed the clone
if not _is_pto_isa_cloned():
if verbose:
logger.warning("Failed to automatically clone pto-isa.")
logger.warning("You can manually clone it with:")
logger.warning(f" mkdir -p {clone_path.parent}")
logger.warning(f" git clone {_PTO_ISA_REPO} {clone_path}")
logger.warning("Or set PTO_ISA_ROOT to an existing pto-isa installation:")
logger.warning(" export PTO_ISA_ROOT=/path/to/pto-isa")
return None
if verbose:
logger.warning("Failed to automatically clone pto-isa.")
logger.warning("You can manually clone it with:")
logger.warning(f" mkdir -p {clone_path.parent}")
logger.warning(f" git clone {_PTO_ISA_REPO} {clone_path}")
logger.warning("Or set PTO_ISA_ROOT to an existing pto-isa installation:")
logger.warning(" export PTO_ISA_ROOT=/path/to/pto-isa")
return None
logger.info("pto-isa already cloned by another process")
# Recovered from race — apply commit/update below
if commit:
_checkout_pto_isa_commit(clone_path, commit, verbose=verbose)
else:
_update_pto_isa_to_latest(clone_path, verbose=verbose)
elif commit:
_checkout_pto_isa_commit(clone_path, commit, verbose=verbose)
else:
Expand Down
48 changes: 32 additions & 16 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,17 @@ int DeviceRunner::finalize() {
// Cleanup AICPU SO
so_info_.finalize();

// Clear kernel address mapping
// Kernel binaries should have been removed by validate_runtime_impl()
if (!func_id_to_addr_.empty()) {
LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)",
func_id_to_addr_.size());
// Cleanup leaked binaries to prevent memory leaks
for (const auto& pair : func_id_to_addr_) {
void* gm_addr = reinterpret_cast<void*>(pair.second);
mem_alloc_.free(gm_addr);
LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second);
}
}
Comment thread
ChaoWao marked this conversation as resolved.
func_id_to_addr_.clear();
binaries_loaded_ = false;

Expand Down Expand Up @@ -631,39 +641,45 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data

LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size);

// Allocate device GM memory (size field + binary data)
uint64_t alloc_size = sizeof(uint64_t) + bin_size;
void* gm_addr = mem_alloc_.alloc(alloc_size);
// Allocate device GM memory for kernel binary
void* gm_addr = mem_alloc_.alloc(bin_size);
if (gm_addr == nullptr) {
LOG_ERROR("Failed to allocate device GM memory for kernel func_id=%d", func_id);
return 0;
}

// Build host buffer with CoreFunctionBin structure (size + data)
std::vector<uint8_t> host_buf(alloc_size);
uint64_t* size_ptr = reinterpret_cast<uint64_t*>(host_buf.data());
*size_ptr = bin_size;
std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, bin_size);

// Copy to device
int rc = rtMemcpy(gm_addr, alloc_size, host_buf.data(), alloc_size, RT_MEMCPY_HOST_TO_DEVICE);
// Copy kernel binary to device
int rc = rtMemcpy(gm_addr, bin_size, bin_data, bin_size, RT_MEMCPY_HOST_TO_DEVICE);
if (rc != 0) {
LOG_ERROR("rtMemcpy to device failed: %d", rc);
mem_alloc_.free(gm_addr);
return 0;
}

// Calculate function_bin_addr (skip size field to get actual code address)
uint64_t function_bin_addr = reinterpret_cast<uint64_t>(gm_addr) + sizeof(uint64_t);

// Cache for later reuse and cleanup
// Cache the kernel address
uint64_t function_bin_addr = reinterpret_cast<uint64_t>(gm_addr);
func_id_to_addr_[func_id] = function_bin_addr;

LOG_DEBUG(" func_id=%d -> function_bin_addr=0x%lx", func_id, function_bin_addr);

return function_bin_addr;
}

/**
 * Drop one cached kernel binary.
 *
 * Looks up func_id in func_id_to_addr_. When an entry exists, the cached
 * address is passed to mem_alloc_.free() and the entry is erased; when no
 * entry exists the call returns without doing anything.
 *
 * NOTE(review): assumes the cached address is the pointer originally
 * returned by mem_alloc_.alloc() - confirm against upload_kernel_binary().
 *
 * @param func_id Function identifier whose binary should be released
 */
void DeviceRunner::remove_kernel_binary(int func_id) {
    auto entry = func_id_to_addr_.find(func_id);
    if (entry != func_id_to_addr_.end()) {
        // Copy the address out before erasing: it is still needed for the
        // log line below, and the iterator is unusable after erase().
        const uint64_t cached_addr = entry->second;

        mem_alloc_.free(reinterpret_cast<void*>(cached_addr));
        func_id_to_addr_.erase(entry);

        LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, cached_addr);
    }
}
Comment thread
ChaoWao marked this conversation as resolved.

int DeviceRunner::init_performance_profiling(Runtime& runtime, int num_aicore, int device_id) {
// Define allocation callback (a2a3: use MemoryAllocator)
auto alloc_cb = [](size_t size, void* user_data) -> void* {
Expand Down
10 changes: 10 additions & 0 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,16 @@ class DeviceRunner {
*/
uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size);

/**
 * Remove a kernel binary from device memory
 *
 * Looks up func_id in the cached kernel address map. If an entry exists,
 * the device memory recorded for that kernel is freed and the cached entry
 * is removed. If func_id is not cached, the call does nothing.
 *
 * This should be called during per-case cleanup. finalize() logs an error
 * and frees any kernel binaries that are still cached at that point.
 *
 * @param func_id Function identifier to remove
 */
void remove_kernel_binary(int func_id);

/**
* Ensure device is set and streams are created (minimal initialization)
*
Expand Down
11 changes: 11 additions & 0 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ void device_free(void* dev_ptr);
int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size);
int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size);
uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size);
void remove_kernel_binary_wrapper(int func_id);

/* ===========================================================================
*/
Expand Down Expand Up @@ -75,6 +76,7 @@ int init_runtime(RuntimeHandle runtime,
r->host_api.copy_to_device = copy_to_device;
r->host_api.copy_from_device = copy_from_device;
r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;

LOG_DEBUG("About to call init_runtime_impl, r=%p", (void*)r);

Expand Down Expand Up @@ -157,6 +159,15 @@ uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size
}
}

void remove_kernel_binary_wrapper(int func_id) {
try {
DeviceRunner& runner = DeviceRunner::get();
runner.remove_kernel_binary(func_id);
} catch (...) {
// Ignore errors during cleanup
}
}

int launch_runtime(RuntimeHandle runtime,
int aicpu_thread_num,
int block_dim,
Expand Down
34 changes: 26 additions & 8 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,14 +342,17 @@ int DeviceRunner::finalize() {
perf_collector_.finalize(nullptr, free_cb, nullptr);
}

// Close all dlopen'd kernel libraries
for (auto& pair : func_id_to_addr_) {
MappedKernel& kernel = pair.second;
if (kernel.dl_handle != nullptr) {
dlclose(kernel.dl_handle);
LOG_DEBUG("Closed dlopen kernel: func_id=%d", pair.first);
kernel.dl_handle = nullptr;
kernel.func_addr = 0;
// Kernel binaries should have been removed by validate_runtime_impl()
if (!func_id_to_addr_.empty()) {
LOG_ERROR("finalize() called with %zu kernel binaries still cached",
func_id_to_addr_.size());
// Cleanup leaked handles
for (auto& pair : func_id_to_addr_) {
MappedKernel& kernel = pair.second;
if (kernel.dl_handle != nullptr) {
dlclose(kernel.dl_handle);
LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first);
}
}
}
func_id_to_addr_.clear();
Expand Down Expand Up @@ -450,6 +453,21 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data
return kernel.func_addr;
}

void DeviceRunner::remove_kernel_binary(int func_id) {
auto it = func_id_to_addr_.find(func_id);
if (it == func_id_to_addr_.end()) {
return;
}

MappedKernel& kernel = it->second;
if (kernel.dl_handle != nullptr) {
dlclose(kernel.dl_handle);
LOG_DEBUG("Removed kernel binary (dlclose): func_id=%d, handle=%p", func_id, kernel.dl_handle);
}

func_id_to_addr_.erase(it);
}
Comment thread
ChaoWao marked this conversation as resolved.

// =============================================================================
// Performance Profiling Implementation
// =============================================================================
Expand Down
10 changes: 10 additions & 0 deletions src/a2a3/platform/sim/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ class DeviceRunner {
*/
uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size);

/**
 * Remove a kernel binary from memory
 *
 * Looks up func_id in the cached kernel map. If an entry exists, its
 * dlopen handle is closed (when non-null) and the cached entry is removed.
 * If func_id is not cached, the call does nothing.
 *
 * This should be called during per-case cleanup.
 *
 * @param func_id Function identifier to remove
 */
void remove_kernel_binary(int func_id);

private:
DeviceRunner() = default;
~DeviceRunner();
Expand Down
11 changes: 11 additions & 0 deletions src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ void device_free(void* dev_ptr);
int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size);
int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size);
uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size);
void remove_kernel_binary_wrapper(int func_id);

/* ===========================================================================
* Runtime API Implementation
Expand Down Expand Up @@ -79,6 +80,7 @@ int init_runtime(RuntimeHandle runtime,
r->host_api.copy_to_device = copy_to_device;
r->host_api.copy_from_device = copy_from_device;
r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper;
r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper;

// Delegate kernel registration, SO loading, and orchestration to init_runtime_impl
int result = init_runtime_impl(r, orch_so_binary, orch_so_size,
Expand Down Expand Up @@ -156,6 +158,15 @@ uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size
}
}

void remove_kernel_binary_wrapper(int func_id) {
try {
DeviceRunner& runner = DeviceRunner::get();
runner.remove_kernel_binary(func_id);
} catch (...) {
// Ignore errors during cleanup
}
}

int launch_runtime(RuntimeHandle runtime,
int aicpu_thread_num,
int block_dim,
Expand Down
11 changes: 3 additions & 8 deletions src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,14 +310,9 @@ int validate_runtime_impl(Runtime* runtime) {
int kernel_count = runtime->get_registered_kernel_count();
for (int i = 0; i < kernel_count; i++) {
int func_id = runtime->get_registered_kernel_func_id(i);
uint64_t addr = runtime->get_function_bin_addr(func_id);
if (addr != 0) {
// Kernel binary is stored at (addr - sizeof(uint64_t))
void* gm_addr = reinterpret_cast<void*>(addr - sizeof(uint64_t));
runtime->host_api.device_free(gm_addr);
runtime->set_function_bin_addr(func_id, 0);
kernel_freed++;
}
runtime->host_api.remove_kernel_binary(func_id);
runtime->set_function_bin_addr(func_id, 0);
kernel_freed++;
}
if (kernel_freed > 0) {
std::cout << "Freed " << kernel_freed << " kernel binaries\n";
Expand Down
1 change: 1 addition & 0 deletions src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ struct HostApi {
int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size);
int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size);
uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size);
void (*remove_kernel_binary)(int func_id);
};

/**
Expand Down
8 changes: 2 additions & 6 deletions src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,12 +212,8 @@ int validate_runtime_impl(Runtime *runtime) {
int kernel_count = runtime->get_registered_kernel_count();
for (int i = 0; i < kernel_count; i++) {
int func_id = runtime->get_registered_kernel_func_id(i);
uint64_t addr = runtime->get_function_bin_addr(func_id);
if (addr != 0) {
void* gm_addr = reinterpret_cast<void*>(addr - sizeof(uint64_t));
runtime->host_api.device_free(gm_addr);
runtime->set_function_bin_addr(func_id, 0);
}
runtime->host_api.remove_kernel_binary(func_id);
runtime->set_function_bin_addr(func_id, 0);
}
if (kernel_count > 0) {
LOG_INFO("Freed %d kernel binaries", kernel_count);
Expand Down
1 change: 1 addition & 0 deletions src/a2a3/runtime/host_build_graph/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ struct HostApi {
int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size);
int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size);
uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size);
void (*remove_kernel_binary)(int func_id);
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -414,12 +414,8 @@ extern "C" int validate_runtime_impl(Runtime *runtime) {
int kernel_count = runtime->get_registered_kernel_count();
for (int i = 0; i < kernel_count; i++) {
int func_id = runtime->get_registered_kernel_func_id(i);
uint64_t addr = runtime->get_function_bin_addr(func_id);
if (addr != 0) {
void* gm_addr = reinterpret_cast<void*>(addr - sizeof(uint64_t));
runtime->host_api.device_free(gm_addr);
runtime->set_function_bin_addr(func_id, 0);
}
runtime->host_api.remove_kernel_binary(func_id);
runtime->set_function_bin_addr(func_id, 0);
}
if (kernel_count > 0) {
LOG_INFO("Freed %d kernel binaries", kernel_count);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ struct HostApi {
int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size);
int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size);
uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size);
void (*remove_kernel_binary)(int func_id);
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count
__attribute__((visibility("default")))
void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) {
(void)arg_count;
pto2_rt_init_tensor_pool(rt);

void* dev_A = (void*)(uintptr_t)args[ARG_PTR_A];
void* dev_B = (void*)(uintptr_t)args[ARG_PTR_B];
Expand Down
Loading