diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index ca0b30bf5..985f3955d 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -311,14 +311,23 @@ def _ensure_pto_isa_root(verbose: bool = False, commit: Optional[str] = None) -> if verbose: logger.info("PTO_ISA_ROOT not set, cloning pto-isa repository...") if not _clone_pto_isa(verbose=verbose, commit=commit): + # Another parallel process may have completed the clone + if not _is_pto_isa_cloned(): + if verbose: + logger.warning("Failed to automatically clone pto-isa.") + logger.warning("You can manually clone it with:") + logger.warning(f" mkdir -p {clone_path.parent}") + logger.warning(f" git clone {_PTO_ISA_REPO} {clone_path}") + logger.warning("Or set PTO_ISA_ROOT to an existing pto-isa installation:") + logger.warning(" export PTO_ISA_ROOT=/path/to/pto-isa") + return None if verbose: - logger.warning("Failed to automatically clone pto-isa.") - logger.warning("You can manually clone it with:") - logger.warning(f" mkdir -p {clone_path.parent}") - logger.warning(f" git clone {_PTO_ISA_REPO} {clone_path}") - logger.warning("Or set PTO_ISA_ROOT to an existing pto-isa installation:") - logger.warning(" export PTO_ISA_ROOT=/path/to/pto-isa") - return None + logger.info("pto-isa already cloned by another process") + # Recovered from race — apply commit/update below + if commit: + _checkout_pto_isa_commit(clone_path, commit, verbose=verbose) + else: + _update_pto_isa_to_latest(clone_path, verbose=verbose) elif commit: _checkout_pto_isa_commit(clone_path, commit, verbose=verbose) else: diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 5077335f4..282bd7329 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -495,7 +495,17 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); - // Clear kernel address mapping + // Kernel binaries should have been removed by validate_runtime_impl() + if (!func_id_to_addr_.empty()) { + LOG_ERROR("finalize() called with %zu kernel binaries still cached (memory leak)", + func_id_to_addr_.size()); + // Cleanup leaked binaries to prevent memory leaks + for (const auto& pair : func_id_to_addr_) { + void* gm_addr = reinterpret_cast(pair.second); + mem_alloc_.free(gm_addr); + LOG_DEBUG("Freed leaked kernel binary: func_id=%d, addr=0x%lx", pair.first, pair.second); + } + } func_id_to_addr_.clear(); binaries_loaded_ = false; @@ -631,32 +641,23 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data LOG_DEBUG("Uploading kernel binary: func_id=%d, size=%zu bytes", func_id, bin_size); - // Allocate device GM memory (size field + binary data) - uint64_t alloc_size = sizeof(uint64_t) + bin_size; - void* gm_addr = mem_alloc_.alloc(alloc_size); + // Allocate device GM memory for kernel binary + void* gm_addr = mem_alloc_.alloc(bin_size); if (gm_addr == nullptr) { LOG_ERROR("Failed to allocate device GM memory for kernel func_id=%d", func_id); return 0; } - // Build host buffer with CoreFunctionBin structure (size + data) - std::vector host_buf(alloc_size); - uint64_t* size_ptr = reinterpret_cast(host_buf.data()); - *size_ptr = bin_size; - std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, bin_size); - - // Copy to device - int rc = rtMemcpy(gm_addr, alloc_size, host_buf.data(), alloc_size, RT_MEMCPY_HOST_TO_DEVICE); + // Copy kernel binary to device + int rc = rtMemcpy(gm_addr, bin_size, bin_data, bin_size, RT_MEMCPY_HOST_TO_DEVICE); if (rc != 0) { LOG_ERROR("rtMemcpy to device failed: %d", rc); mem_alloc_.free(gm_addr); return 0; } - // Calculate function_bin_addr (skip size field to get actual code address) - uint64_t function_bin_addr = reinterpret_cast(gm_addr) + sizeof(uint64_t); - - // Cache for later reuse and cleanup + // Cache the kernel address + uint64_t function_bin_addr = reinterpret_cast(gm_addr); func_id_to_addr_[func_id] = function_bin_addr; LOG_DEBUG(" func_id=%d -> function_bin_addr=0x%lx", func_id, function_bin_addr); @@ -664,6 +665,21 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data return function_bin_addr; } +void DeviceRunner::remove_kernel_binary(int func_id) { + auto it = func_id_to_addr_.find(func_id); + if (it == func_id_to_addr_.end()) { + return; + } + + uint64_t function_bin_addr = it->second; + void* gm_addr = reinterpret_cast(function_bin_addr); + + mem_alloc_.free(gm_addr); + func_id_to_addr_.erase(it); + + LOG_DEBUG("Removed kernel binary: func_id=%d, addr=0x%lx", func_id, function_bin_addr); +} + int DeviceRunner::init_performance_profiling(Runtime& runtime, int num_aicore, int device_id) { // Define allocation callback (a2a3: use MemoryAllocator) auto alloc_cb = [](size_t size, void* user_data) -> void* { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 88a0a36ec..8c6f454cd 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -305,6 +305,16 @@ class DeviceRunner { */ uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size); + /** + * Remove a kernel binary from device memory + * + * Frees the device memory allocated for the kernel and removes the + * cached entry. This should be called during per-case cleanup. + * + * @param func_id Function identifier to remove + */ + void remove_kernel_binary(int func_id); + /** * Ensure device is set and streams are created (minimal initialization) * diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index f319dcc5f..e1e3b151a 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -38,6 +38,7 @@ void device_free(void* dev_ptr); int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size); +void remove_kernel_binary_wrapper(int func_id); /* =========================================================================== */ @@ -75,6 +76,7 @@ int init_runtime(RuntimeHandle runtime, r->host_api.copy_to_device = copy_to_device; r->host_api.copy_from_device = copy_from_device; r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; LOG_DEBUG("About to call init_runtime_impl, r=%p", (void*)r); @@ -157,6 +159,15 @@ uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size } } +void remove_kernel_binary_wrapper(int func_id) { + try { + DeviceRunner& runner = DeviceRunner::get(); + runner.remove_kernel_binary(func_id); + } catch (...) { + // Ignore errors during cleanup + } +} + int launch_runtime(RuntimeHandle runtime, int aicpu_thread_num, int block_dim, diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index cf9e9f510..483cca6f5 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -342,14 +342,17 @@ int DeviceRunner::finalize() { perf_collector_.finalize(nullptr, free_cb, nullptr); } - // Close all dlopen'd kernel libraries - for (auto& pair : func_id_to_addr_) { - MappedKernel& kernel = pair.second; - if (kernel.dl_handle != nullptr) { - dlclose(kernel.dl_handle); - LOG_DEBUG("Closed dlopen kernel: func_id=%d", pair.first); - kernel.dl_handle = nullptr; - kernel.func_addr = 0; + // Kernel binaries should have been removed by validate_runtime_impl() + if (!func_id_to_addr_.empty()) { + LOG_ERROR("finalize() called with %zu kernel binaries still cached", + func_id_to_addr_.size()); + // Cleanup leaked handles + for (auto& pair : func_id_to_addr_) { + MappedKernel& kernel = pair.second; + if (kernel.dl_handle != nullptr) { + dlclose(kernel.dl_handle); + LOG_DEBUG("Closed leaked kernel: func_id=%d", pair.first); + } } } func_id_to_addr_.clear(); @@ -450,6 +453,21 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data return kernel.func_addr; } +void DeviceRunner::remove_kernel_binary(int func_id) { + auto it = func_id_to_addr_.find(func_id); + if (it == func_id_to_addr_.end()) { + return; + } + + MappedKernel& kernel = it->second; + if (kernel.dl_handle != nullptr) { + dlclose(kernel.dl_handle); + LOG_DEBUG("Removed kernel binary (dlclose): func_id=%d, handle=%p", func_id, kernel.dl_handle); + } + + func_id_to_addr_.erase(it); +} + // ============================================================================= // Performance Profiling Implementation // ============================================================================= diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 6172fa1a9..b88b927ac 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -182,6 +182,16 @@ class DeviceRunner { */ uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size); + /** + * Remove a kernel binary from memory + * + * Closes the dlopen handle and removes the cached entry. + * This should be called during per-case cleanup. + * + * @param func_id Function identifier to remove + */ + void remove_kernel_binary(int func_id); + private: DeviceRunner() = default; ~DeviceRunner(); diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 90ae2013e..a16728128 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -41,6 +41,7 @@ void device_free(void* dev_ptr); int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size); +void remove_kernel_binary_wrapper(int func_id); /* =========================================================================== * Runtime API Implementation @@ -79,6 +80,7 @@ int init_runtime(RuntimeHandle runtime, r->host_api.copy_to_device = copy_to_device; r->host_api.copy_from_device = copy_from_device; r->host_api.upload_kernel_binary = upload_kernel_binary_wrapper; + r->host_api.remove_kernel_binary = remove_kernel_binary_wrapper; // Delegate kernel registration, SO loading, and orchestration to init_runtime_impl int result = init_runtime_impl(r, orch_so_binary, orch_so_size, @@ -156,6 +158,15 @@ uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size } } +void remove_kernel_binary_wrapper(int func_id) { + try { + DeviceRunner& runner = DeviceRunner::get(); + runner.remove_kernel_binary(func_id); + } catch (...) { + // Ignore errors during cleanup + } +} + int launch_runtime(RuntimeHandle runtime, int aicpu_thread_num, int block_dim, diff --git a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp index 831ba5d7d..e1fb639d2 100644 --- a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp @@ -310,14 +310,9 @@ int validate_runtime_impl(Runtime* runtime) { int kernel_count = runtime->get_registered_kernel_count(); for (int i = 0; i < kernel_count; i++) { int func_id = runtime->get_registered_kernel_func_id(i); - uint64_t addr = runtime->get_function_bin_addr(func_id); - if (addr != 0) { - // Kernel binary is stored at (addr - sizeof(uint64_t)) - void* gm_addr = reinterpret_cast(addr - sizeof(uint64_t)); - runtime->host_api.device_free(gm_addr); - runtime->set_function_bin_addr(func_id, 0); - kernel_freed++; - } + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); + kernel_freed++; } if (kernel_freed > 0) { std::cout << "Freed " << kernel_freed << " kernel binaries\n"; diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h index bf7b4dca6..86b54eebb 100644 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h @@ -177,6 +177,7 @@ struct HostApi { int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + void (*remove_kernel_binary)(int func_id); }; /** diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index a2ede8d29..91db0ca0d 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -212,12 +212,8 @@ int validate_runtime_impl(Runtime *runtime) { int kernel_count = runtime->get_registered_kernel_count(); for (int i = 0; i < kernel_count; i++) { int func_id = runtime->get_registered_kernel_func_id(i); - uint64_t addr = runtime->get_function_bin_addr(func_id); - if (addr != 0) { - void* gm_addr = reinterpret_cast(addr - sizeof(uint64_t)); - runtime->host_api.device_free(gm_addr); - runtime->set_function_bin_addr(func_id, 0); - } + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); } if (kernel_count > 0) { LOG_INFO("Freed %d kernel binaries", kernel_count); diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index a3306bc1d..2271d6a99 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -132,6 +132,7 @@ struct HostApi { int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + void (*remove_kernel_binary)(int func_id); }; /** diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index e4cf30ea3..68aeea048 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -414,12 +414,8 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { int kernel_count = runtime->get_registered_kernel_count(); for (int i = 0; i < kernel_count; i++) { int func_id = runtime->get_registered_kernel_func_id(i); - uint64_t addr = runtime->get_function_bin_addr(func_id); - if (addr != 0) { - void* gm_addr = reinterpret_cast(addr - sizeof(uint64_t)); - runtime->host_api.device_free(gm_addr); - runtime->set_function_bin_addr(func_id, 0); - } + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); } if (kernel_count > 0) { LOG_INFO("Freed %d kernel binaries", kernel_count); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index edafc6646..6c1402a73 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -108,6 +108,7 @@ struct HostApi { int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + void (*remove_kernel_binary)(int func_id); }; /** diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp index 7b2798678..563795a53 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp @@ -56,7 +56,6 @@ PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { (void)arg_count; - pto2_rt_init_tensor_pool(rt); void* dev_A = (void*)(uintptr_t)args[ARG_PTR_A]; void* dev_B = (void*)(uintptr_t)args[ARG_PTR_B];