diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md index fb8f9a1c662..f383130b068 100644 --- a/backends/qualcomm/debugger/README.md +++ b/backends/qualcomm/debugger/README.md @@ -50,7 +50,7 @@ Generate optrace and QHAS files using QNN tools under $QNN_SDK_ROOT. After finis adb = SimpleADB( qnn_config=qnn_config, pte_path=f"{args.artifact}/{pte_filename}.pte", - workspace=f"/data/local/tmp/executorch/{pte_filename}, + workspace=f"/data/local/tmp/executorch/{pte_filename}", ) binaries_trace = generate_optrace( args, adb, f"{args.artifact}/{pte_filename}.pte", example_input @@ -78,7 +78,7 @@ qairt_visualizer.view(reports=[optrace, qhas]) - `model`: Path to your QNN model file (e.g., `path_to_your_model.dlc`). - **`reports`**: List of report file paths, including the optrace (`optrace.json`) and QHAS (`optrace_qnn_htp_analysis_summary.json`). -Note: Files ending with `.bin ` do not support graph visualization in qairt_visualizer. +Note: Files ending with `.bin` do not support graph visualization in qairt_visualizer. ## Demo @@ -226,3 +226,79 @@ python examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py -b build 2. Please ignore this if you are using `qnn_executor_runner`. If you have decided to write your own runner, please follow the [tutorial](https://pytorch.org/executorch/stable/etdump.html) on how to implement etdump into your own runner. 3. The current debugger does not support graph with partitions. (WIP) 4. The current debugger does not support LLM models. (WIP) + + +## ExecuTorch QNN HTP Heap Profiling + +Measures DSP memory usage when using context binary models on the HTP backend. + +### Introduction + +DSP heap profiling is available for `QnnContext_createFromBinary` use-cases. 
It captures total DSP heap usage at two checkpoints:
+
+- **Before the first context is created** (`before_context_created`)
+- **After the last context is freed** (`after_context_freed`)
+
+The difference between the two values represents heap consumed during context execution. The value after freeing is typically equal to or greater than before creation.
+
+### Instructions
+
+#### Run the example test
+
+```bash
+python backends/qualcomm/tests/test_qnn_delegate.py \
+    TestQNNQuantizedUtils.test_qnn_backend_runtime_option_heap_profile \
+    -b build-android -H ${HOST} -s ${SN} -m ${SOC_MODEL}
+```
+
+See [test_qnn_delegate.py](../tests/test_qnn_delegate.py) for the full test implementation.
+
+#### Setting
+
+```python
+from executorch.backends.qualcomm.utils.utils import generate_htp_compiler_spec
+from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_compiler_spec
+
+backend_options = generate_htp_compiler_spec(
+    use_multi_contexts=True,
+)
+
+compiler_specs = generate_qnn_executorch_compiler_spec(
+    soc_model=self.chipset_table[TestQNN.soc_model],
+    backend_options=backend_options,
+    profile_level=2,
+)
+
+# ...
+
+self.verify_output(
+    module,
+    sample_input,
+    exec_prog,
+    save_heap_result=True,
+)
+```
+
+#### Output file format
+
+The result is written to a text file (default: `htp_heap_usage.txt`) with two lines:
+
+```
+DSP:before_context_created (bytes),
+DSP:after_context_freed (bytes),
+```
+
+#### Reference result
+
+Measured on SM8850. A difference of 0 means no additional heap is consumed during context binary execution.
+
+```console
+First value (before_context_created): 928212 bytes
+Second value (after_context_freed): 928212 bytes
+difference: 0.00 bytes
+```
+
+### Limitations
+
+1. Only supported on the HTP backend on Android and QNX platforms.
+2. By enabling this feature, initialization and cleanup time might be impacted.
diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py index 313573e523a..9b3703357c3 100644 --- a/backends/qualcomm/export_utils.py +++ b/backends/qualcomm/export_utils.py @@ -494,6 +494,11 @@ def pull_debug_output(self, etdump_path, debug_ouput_path, callback=None): if callback: callback() + def pull_heap_output(self, src_file_path, dst_folder, callback=None): + self._adb(["pull", src_file_path, dst_folder]) + if callback: + callback() + def build_executorch_binary( model: torch.nn.Module, # noqa: B006 diff --git a/backends/qualcomm/runtime/QnnBackendOptions.cpp b/backends/qualcomm/runtime/QnnBackendOptions.cpp index 0eb678b45e2..2117932bddc 100644 --- a/backends/qualcomm/runtime/QnnBackendOptions.cpp +++ b/backends/qualcomm/runtime/QnnBackendOptions.cpp @@ -52,6 +52,14 @@ template QnnExecuTorchProfileLevel get_option( QnnExecuTorchProfileLevel, const char*); +executorch::runtime::Error get_runtime_option( + const char* key, + executorch::runtime::BackendOption& backend_option) { + std::strncpy(backend_option.key, key, runtime::kMaxOptionKeyLength); + backend_option.key[runtime::kMaxOptionKeyLength - 1] = '\0'; + return get_option(QNN_BACKEND, backend_option); +} + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/QnnBackendOptions.h b/backends/qualcomm/runtime/QnnBackendOptions.h index c366755edd0..93e0de1fb61 100644 --- a/backends/qualcomm/runtime/QnnBackendOptions.h +++ b/backends/qualcomm/runtime/QnnBackendOptions.h @@ -37,6 +37,19 @@ struct RuntimeOption { template T get_option(T aot_option, const char* aot_key); +/** + * @brief + * Get the backend option. + * This method checks runtime option only. + * + * @param key The key of runtime option. + * @param backend_option The backend_option to be restored in runtime. 
+ */ + +executorch::runtime::Error get_runtime_option( + const char* key, + executorch::runtime::BackendOption& backend_option); + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index 8a0ee3fed4b..9699e5b4735 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -25,6 +25,7 @@ #define QNN_RUNTIME_LPAI_CLIENT_PERF_TYPE "qnn_runtime_lpai_client_perf_type" #define QNN_RUNTIME_LPAI_AFFINITY "qnn_runtime_lpai_affinity" #define QNN_RUNTIME_LPAI_CORE_SELECTION "qnn_runtime_lpai_core_selection" +#define QNN_RUNTIME_HEAP_PROFILING_PATH "qnn_runtime_heap_profiling_path" #ifdef __cplusplus extern "C" { diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 33cca5350d9..b47fa42b268 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -245,6 +245,13 @@ executorch::runtime::Error QnnExecuTorchBackend::set_option( qnn_runtime_lpai_core_selection_.value = *val; qnn_runtime_lpai_core_selection_.is_set = true; } + } else if (strcmp(option.key, QNN_RUNTIME_HEAP_PROFILING_PATH) == 0) { + if (auto* val = + std::get_if>( + &option.value)) { + qnn_runtime_heap_profiling_path_.value = *val; + qnn_runtime_heap_profiling_path_.is_set = true; + } } else { ET_LOG( Error, @@ -303,6 +310,10 @@ executorch::runtime::Error QnnExecuTorchBackend::get_option( strcmp(backend_options[i].key, QNN_RUNTIME_LPAI_CORE_SELECTION) == 0 && qnn_runtime_lpai_core_selection_.is_set) { backend_options[i].value = qnn_runtime_lpai_core_selection_.value; + } else if ( + strcmp(backend_options[i].key, QNN_RUNTIME_HEAP_PROFILING_PATH) == 0 && + qnn_runtime_heap_profiling_path_.is_set) { + backend_options[i].value = qnn_runtime_heap_profiling_path_.value; } else { // either runtime never called set_option or key 
does not exist matches--; diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index 942e61e2267..e3548c8752b 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -71,6 +71,7 @@ class QnnExecuTorchBackend final RuntimeOption qnn_runtime_lpai_client_perf_type_{false, 0}; RuntimeOption qnn_runtime_lpai_affinity_{false, 0}; RuntimeOption qnn_runtime_lpai_core_selection_{false, 0}; + RuntimeOption qnn_runtime_heap_profiling_path_{false, {}}; }; } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index fa2008befd5..4e819a43121 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -71,7 +71,8 @@ std::unique_ptr QnnBackendFactory::Create( qnn_device_ptr, backend_params->qnn_backend_cache_ptr_.get(), htp_options, - qnn_dlc_manager); + qnn_dlc_manager, + get_option(options->profile_level(), QNN_RUNTIME_PROFILE_LEVEL)); backend_params->qnn_graph_ptr_ = std::make_unique( implementation_ptr, diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp index e16a173db6c..e81f92a8003 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -13,12 +14,46 @@ namespace executorch { namespace backends { namespace qnn { +std::mutex QnnContext::htp_context_mutex_; +int QnnContext::htp_context_count_{0}; + +void QnnContext::WriteHeapProfile() { + executorch::runtime::BackendOption backend_option; + std::string heap_profiling_path; + if (get_runtime_option(QNN_RUNTIME_HEAP_PROFILING_PATH, backend_option) == + Error::Ok) { + auto* arr = std::get_if>( + &backend_option.value); + if (arr) { + heap_profiling_path = arr->data(); + } + } + Qnn_ErrorHandle_t error_profile = + qnn_profiler_->ProfileDataToFile(heap_profiling_path); + if (error_profile != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to profile. Cannot get profile from handle. Error %d", + QNN_GET_ERROR_CODE(error_profile)); + } +} + QnnContext::~QnnContext() { const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); Qnn_ErrorHandle_t error = QNN_SUCCESS; + if (handle_ != nullptr) { QNN_EXECUTORCH_LOG_INFO("Destroy Qnn context"); - error = qnn_interface.qnn_context_free(handle_, /*profile=*/nullptr); + + bool do_heap_profile = false; + { + std::lock_guard lock(htp_context_mutex_); + if (is_htp_backend_ && htp_context_count_ > 0 && need_to_profile_) { + --htp_context_count_; + do_heap_profile = (htp_context_count_ == 0); + } + } + error = qnn_interface.qnn_context_free( + handle_, do_heap_profile ? 
qnn_profiler_->GetHandle() : nullptr); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Failed to free QNN " @@ -26,6 +61,8 @@ QnnContext::~QnnContext() { "ID %u, error %d", qnn_interface.GetBackendId(), QNN_GET_ERROR_CODE(error)); + } else if (do_heap_profile) { + WriteHeapProfile(); } handle_ = nullptr; } @@ -45,21 +82,51 @@ Error QnnContext::Configure() { if (cache_->GetCacheState() == QnnBackendCache::DESERIALIZE) { const QnnExecuTorchContextBinary& qnn_context_blob = cache_->GetQnnContextBlob(); + /* + Total DSP heap usage can be measured in two conditions, first context + creation and last context free. By the QNN documentation, we need to insert + profileHandle in qnn_context_create_from_binary when creating first context + and closing last context. + + Limitations are two: + 1.Only supported on Android and QNX platforms. + 2.By enabling this feature initialization and cleanup time might be + impacted. + */ + + bool do_heap_profile = false; + { + std::lock_guard lock(htp_context_mutex_); + do_heap_profile = + is_htp_backend_ && (htp_context_count_ == 0) && need_to_profile_; + if (is_htp_backend_) { + ++htp_context_count_; + } + } error = qnn_interface.qnn_context_create_from_binary( backend_->GetHandle(), device_->GetHandle(), - temp_context_config.empty() ? nullptr : temp_context_config.data(), + (temp_context_config.empty() ? nullptr : temp_context_config.data()), static_cast(qnn_context_blob.buffer), qnn_context_blob.nbytes, &handle_, - /*profile=*/nullptr); + do_heap_profile ? qnn_profiler_->GetHandle() : nullptr); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Can't create context from " "binary. 
Error %d.", QNN_GET_ERROR_CODE(error)); + // Rollback the count since context creation failed + { + std::lock_guard lock(htp_context_mutex_); + if (is_htp_backend_ && htp_context_count_ > 0) { + --htp_context_count_; + } + } return Error::Internal; + } else if (do_heap_profile) { + WriteHeapProfile(); } } else if ( cache_->GetCacheState() == QnnBackendCache::SERIALIZE || diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h index 7d507a4a50c..369728bb904 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.h +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h @@ -13,7 +13,10 @@ #include #include +#include + #include +#include namespace executorch { namespace backends { @@ -28,13 +31,23 @@ class QnnContext { QnnBackend* backend, QnnDevice* device, QnnBackendCache* cache, - QnnDlcManager* qnn_dlc_manager) + QnnDlcManager* qnn_dlc_manager, + const QnnExecuTorchProfileLevel& profile_level) : handle_(nullptr), implementation_(implementation), backend_(backend), device_(device), cache_(cache), - qnn_dlc_manager_(qnn_dlc_manager) {} + qnn_dlc_manager_(qnn_dlc_manager), + profile_level_(profile_level), + is_htp_backend_( + implementation->GetQnnInterface().GetBackendId() == + QNN_BACKEND_ID_HTP), + need_to_profile_( + profile_level != QnnExecuTorchProfileLevel::kProfileOff) { + qnn_profiler_ = + std::make_unique(implementation_, backend_, profile_level_); + } virtual ~QnnContext(); @@ -73,6 +86,7 @@ class QnnContext { }; private: + void WriteHeapProfile(); Qnn_ContextHandle_t handle_; QnnImplementation* implementation_; QnnBackend* backend_; @@ -80,6 +94,13 @@ class QnnContext { QnnBackendCache* cache_; QnnContextCustomProtocol qnn_context_custom_protocol_; QnnDlcManager* qnn_dlc_manager_; + + QnnExecuTorchProfileLevel profile_level_; + std::unique_ptr qnn_profiler_; + bool is_htp_backend_; + bool need_to_profile_; + static std::mutex htp_context_mutex_; + static int 
htp_context_count_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index b4650b30796..177e3761afd 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -8,10 +8,34 @@ #include +#include +#include + namespace executorch { namespace backends { namespace qnn { +#define DEFINE_HEAP_BEFORE_CREATION "DSP:before_context_created" +#define DEFINE_HEAP_AFTER_FREED "DSP:after_context_freed" + +namespace { +const char* get_event_unit(QnnProfile_EventUnit_t unit) { + switch (unit) { + case QNN_PROFILE_EVENTUNIT_MICROSEC: + return " (us)"; + case QNN_PROFILE_EVENTUNIT_BYTES: + return " (bytes)"; + case QNN_PROFILE_EVENTUNIT_COUNT: + return " (count)"; + case QNN_PROFILE_EVENTUNIT_BACKEND: + // cycle unit is default appeared + case QNN_PROFILE_EVENTUNIT_CYCLES: + default: + return ""; + } +} +} // namespace + QnnProfile::QnnProfile( QnnImplementation* implementation, QnnBackend* backend, @@ -71,36 +95,36 @@ QnnProfile::QnnProfile( } } +Qnn_ErrorHandle_t QnnProfile::FetchEvents( + const QnnProfile_EventId_t** events_ptr, + std::uint32_t* num_events) { + if (handle_ == nullptr) { + QNN_EXECUTORCH_LOG_WARN("Profile handle is null, skipping FetchEvents"); + *num_events = 0; + return QNN_SUCCESS; + } + const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); + Qnn_ErrorHandle_t error = + qnn_interface.qnn_profile_get_events(handle_, events_ptr, num_events); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to get profile events: %d", QNN_GET_ERROR_CODE(error)); + } + return error; +} + Qnn_ErrorHandle_t QnnProfile::ProfileData( executorch::runtime::EventTracer* event_tracer) { - const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); const QnnProfile_EventId_t* events_ptr = nullptr; - const QnnProfile_EventId_t* sub_events_ptr = nullptr; std::uint32_t 
num_events = 0; - std::uint32_t num_sub_events = 0; - Qnn_ErrorHandle_t error = - qnn_interface.qnn_profile_get_events(handle_, &events_ptr, &num_events); + Qnn_ErrorHandle_t error = FetchEvents(&events_ptr, &num_events); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( - "ProfileData failed to get events: %d", QNN_GET_ERROR_CODE(error)); + "Failed to profile data in function FetchEvents: %d", + QNN_GET_ERROR_CODE(error)); return error; } - - auto get_unit = [](QnnProfile_EventUnit_t unit) { - switch (unit) { - case QNN_PROFILE_EVENTUNIT_MICROSEC: - return " (us)"; - case QNN_PROFILE_EVENTUNIT_BYTES: - return " (bytes)"; - case QNN_PROFILE_EVENTUNIT_COUNT: - return " (count)"; - case QNN_PROFILE_EVENTUNIT_BACKEND: - // cycle unit is default appeared - case QNN_PROFILE_EVENTUNIT_CYCLES: - default: - return ""; - } - }; + const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); QnnProfile_EventData_t event_data; for (std::uint32_t i = 0; i < num_events; ++i) { error = @@ -115,7 +139,7 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( } // add events for other important metrics, e.g. 
RPC execution time std::string identifier = - std::string(event_data.identifier) + get_unit(event_data.unit); + std::string(event_data.identifier) + get_event_unit(event_data.unit); executorch::runtime::event_tracer_log_profiling_delegate( event_tracer, identifier.c_str(), @@ -125,48 +149,114 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( event_data.value); // Check an event's sub events only if it relates to graph execution time // (and its sub events are the individual op executions): - if (backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) { - error = qnn_interface.qnn_profile_get_sub_events( - events_ptr[i], &sub_events_ptr, &num_sub_events); + if (!backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) { + continue; + } + const QnnProfile_EventId_t* sub_events_ptr = nullptr; + std::uint32_t num_sub_events = 0; + error = qnn_interface.qnn_profile_get_sub_events( + events_ptr[i], &sub_events_ptr, &num_sub_events); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "ProfileData failed to get sub events " + "for event %d: %d", + i, + QNN_GET_ERROR_CODE(error)); + return error; + } + + QnnProfile_EventData_t sub_event_data; + for (std::uint32_t j = 0; j < num_sub_events; ++j) { + error = qnn_interface.qnn_profile_get_event_data( + sub_events_ptr[j], &sub_event_data); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( - "ProfileData failed to get sub events " - "for event %d: %d", + "ProfileData failed to get sub " + "event data for sub event %d of event %d: %d", + j, i, QNN_GET_ERROR_CODE(error)); return error; } - - QnnProfile_EventData_t sub_event_data; - for (std::uint32_t j = 0; j < num_sub_events; ++j) { - error = qnn_interface.qnn_profile_get_event_data( - sub_events_ptr[j], &sub_event_data); - if (error != QNN_SUCCESS) { - QNN_EXECUTORCH_LOG_ERROR( - "ProfileData failed to get sub " - "event data for sub event %d of event %d: %d", - j, - i, - QNN_GET_ERROR_CODE(error)); - return error; - } - if (sub_event_data.type == 
QNN_PROFILE_EVENTTYPE_NODE && - (sub_event_data.unit == QNN_PROFILE_EVENTUNIT_MICROSEC || - sub_event_data.unit == QNN_PROFILE_EVENTUNIT_CYCLES)) { - executorch::runtime::event_tracer_log_profiling_delegate( - event_tracer, - sub_event_data.identifier, - /*delegate_debug_id=*/ - static_cast(-1), - 0, - sub_event_data.value); - } + if (sub_event_data.type == QNN_PROFILE_EVENTTYPE_NODE && + (sub_event_data.unit == QNN_PROFILE_EVENTUNIT_MICROSEC || + sub_event_data.unit == QNN_PROFILE_EVENTUNIT_CYCLES)) { + executorch::runtime::event_tracer_log_profiling_delegate( + event_tracer, + sub_event_data.identifier, + /*delegate_debug_id=*/ + static_cast(-1), + 0, + sub_event_data.value); } } } return error; } +Qnn_ErrorHandle_t QnnProfile::ProfileDataToFile( + const std::string profile_filename) { + if (handle_ == nullptr) { + QNN_EXECUTORCH_LOG_WARN( + "Profile handle is null, skipping ProfileDataToFile"); + return QNN_SUCCESS; + } + if (profile_filename.empty()) { + QNN_EXECUTORCH_LOG_WARN( + "Heap profiling path is empty. 
Please provide profiling filename from runtime option."); + return QNN_SUCCESS; + } + const QnnProfile_EventId_t* events_ptr = nullptr; + std::uint32_t num_events = 0; + Qnn_ErrorHandle_t error = FetchEvents(&events_ptr, &num_events); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to profile data in function FetchEvents: %d", + QNN_GET_ERROR_CODE(error)); + return error; + } + const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); + QnnProfile_EventData_t event_data; + std::uint32_t count_num_events = 0; + for (std::uint32_t i = 0; i < num_events; ++i) { + error = + qnn_interface.qnn_profile_get_event_data(events_ptr[i], &event_data); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "ProfileData failed to get event data " + "for event %d: %d", + i, + QNN_GET_ERROR_CODE(error)); + return error; + } + + std::ios_base::openmode open_mode = std::ios::app; + if (strcmp(event_data.identifier, DEFINE_HEAP_BEFORE_CREATION) == 0) { + open_mode = std::ios::trunc; + } else if (strcmp(event_data.identifier, DEFINE_HEAP_AFTER_FREED) == 0) { + open_mode = std::ios::app; + } else { + count_num_events++; + continue; + } + std::string identifier = + std::string(event_data.identifier) + get_event_unit(event_data.unit); + std::ofstream ofs(profile_filename, open_mode); + if (!ofs) { + QNN_EXECUTORCH_LOG_ERROR( + "Error when opening profile file: %s", profile_filename.c_str()); + return QNN_COMMON_ERROR_GENERAL; + } + ofs << identifier << ", " << event_data.value << "\n"; + } + if (count_num_events == num_events) { + QNN_EXECUTORCH_LOG_WARN( + "Not HTP backend but enable htp profiling. 
Please check setting."); + return QNN_SUCCESS; + } + return error; +} + QnnProfile::~QnnProfile() { const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); if (handle_ != nullptr) { diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.h b/backends/qualcomm/runtime/backends/QnnProfiler.h index de8fbd1d9d5..971738a28d6 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.h +++ b/backends/qualcomm/runtime/backends/QnnProfiler.h @@ -12,6 +12,9 @@ #include #include #include "QnnProfile.h" + +#include + namespace executorch { namespace backends { namespace qnn { @@ -24,6 +27,7 @@ class QnnProfile { const QnnExecuTorchProfileLevel& profile_level); ~QnnProfile(); Qnn_ErrorHandle_t ProfileData(executorch::runtime::EventTracer* event_tracer); + Qnn_ErrorHandle_t ProfileDataToFile(const std::string profile_filename); Qnn_ProfileHandle_t GetHandle() { return handle_; @@ -33,6 +37,10 @@ class QnnProfile { Qnn_ProfileHandle_t handle_; QnnImplementation* implementation_; QnnBackend* backend_; + + Qnn_ErrorHandle_t FetchEvents( + const QnnProfile_EventId_t** events_ptr, + std::uint32_t* num_events); }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp b/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp index 07952e77eef..c6c6ace2bdf 100644 --- a/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp +++ b/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp @@ -21,7 +21,13 @@ GpuContext::GpuContext( QnnBackendCache* cache, QnnDlcManager* qnn_dlc_manager, const QnnExecuTorchGpuBackendOptions* gpu_options) - : QnnContext(implementation, backend, device, cache, qnn_dlc_manager) { + : QnnContext( + implementation, + backend, + device, + cache, + qnn_dlc_manager, + QnnExecuTorchProfileLevel::kProfileOff) { gpu_context_custom_config_ = std::make_unique(gpu_options); } diff --git a/backends/qualcomm/runtime/backends/htp/HtpContext.h b/backends/qualcomm/runtime/backends/htp/HtpContext.h index 
a0389ea5983..f00b709f607 100644 --- a/backends/qualcomm/runtime/backends/htp/HtpContext.h +++ b/backends/qualcomm/runtime/backends/htp/HtpContext.h @@ -25,10 +25,17 @@ class HtpContext : public QnnContext { QnnDevice* device, QnnBackendCache* cache, const QnnExecuTorchHtpBackendOptions* htp_options, - QnnDlcManager* qnn_dlc_manager) - : QnnContext(implementation, backend, device, cache, qnn_dlc_manager) { - htp_context_custom_config_ = - std::make_unique(this, htp_options); + QnnDlcManager* qnn_dlc_manager, + const QnnExecuTorchProfileLevel& profile_level) + : QnnContext( + implementation, + backend, + device, + cache, + qnn_dlc_manager, + profile_level) { + htp_context_custom_config_ = std::make_unique( + this, htp_options, profile_level); } ~HtpContext() {} diff --git a/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h b/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h index f0d4873b0d2..64cb279a1c7 100644 --- a/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h @@ -26,12 +26,16 @@ class HtpContextCustomConfig { public: explicit HtpContextCustomConfig( const QnnContext* context, - const QnnExecuTorchHtpBackendOptions* htp_options) - : context_(context), htp_options_(htp_options) {} + const QnnExecuTorchHtpBackendOptions* htp_options, + const QnnExecuTorchProfileLevel& profile_level) + : profile_level_(profile_level), + context_(context), + htp_options_(htp_options) {} std::vector CreateContextCustomConfig(); private: + [[maybe_unused]] QnnExecuTorchProfileLevel profile_level_; QnnHtpContext_CustomConfig_t* AllocContextCustomConfig() { htp_context_config_.emplace_back( std::make_unique()); diff --git a/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp index 4850afa14a2..037998132a8 100644 --- 
a/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include namespace executorch { diff --git a/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp index 676795797f8..8488bf21e79 100644 --- a/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp @@ -19,6 +19,17 @@ HtpContextCustomConfig::CreateContextCustomConfig() { QnnHtpContext_CustomConfig_t* p_custom_config = nullptr; const HtpContext* htp_ctx = static_cast(context_); + // TODO: Verify heap profile works with kProfileBasic once enabled. + if (profile_level_ != QnnExecuTorchProfileLevel::kProfileOff) { + QnnHtpContext_CustomConfig_t* p_custom_config_profile = nullptr; + p_custom_config_profile = AllocContextCustomConfig(); + p_custom_config_profile->option = + QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED; + p_custom_config_profile->dspMemoryProfilingEnabled = true; + ret.push_back( + static_cast(p_custom_config_profile)); + } + if (htp_options_->use_multi_contexts() && htp_options_->max_sf_buf_size() != 0) { p_custom_config = AllocContextCustomConfig(); diff --git a/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp b/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp index 47d583b5c15..62d01c78706 100644 --- a/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp +++ b/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp @@ -47,7 +47,8 @@ Error QnnDlcManager::Create() { backend_bundle_ptr_->qnn_backend_ptr.get(), backend_bundle_ptr_->qnn_device_ptr.get(), backend_params_ptr_->qnn_backend_cache_ptr_.get(), - nullptr); + nullptr, + QnnExecuTorchProfileLevel::kProfileOff); 
backend_params_ptr_->qnn_graph_ptr_ = std::make_unique( backend_bundle_ptr_->implementation.get(), diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp index d5203898f6b..e0c9d3ed3d8 100644 --- a/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp +++ b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp @@ -20,7 +20,13 @@ LpaiContext::LpaiContext( QnnDevice* device, QnnBackendCache* cache, QnnDlcManager* qnn_dlc_manager) - : QnnContext(implementation, backend, device, cache, qnn_dlc_manager) { + : QnnContext( + implementation, + backend, + device, + cache, + qnn_dlc_manager, + QnnExecuTorchProfileLevel::kProfileOff) { lpai_context_custom_config_ = std::make_unique(); } diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index d76e3ea1df7..c44ac6efb59 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -5528,6 +5528,42 @@ def test_qnn_backend_profile_op(self): ) TestQNN.profile_level = 0 + def test_qnn_backend_runtime_option_heap_profile(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + backend_options = generate_htp_compiler_spec( + use_fp16=True, + use_multi_contexts=True, + ) + + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.soc_model], + backend_options=backend_options, + profile_level=2, # if 0 for closing heap profiling + ) + + pass_jobs = get_capture_program_passes() + split_graph_pass, setting = self.split_graph(4) + pass_jobs[split_graph_pass] = setting + dep_table = get_passes_dependency_for_capture_program() + dep_table[split_graph_pass] = [FoldQDQ] + + edge_prog_mgr = to_edge_transform_and_lower_to_qnn( + module=module, + inputs=sample_input, + compiler_specs=compiler_specs, + dep_table=dep_table, + passes_job=pass_jobs, + ) + exec_prog = 
edge_prog_mgr.to_executorch() + self.verify_output( + module, + sample_input, + exec_prog, + save_heap_result=True, + ) + def test_qnn_backend_runtime_option_htp_performance(self): backend_options = generate_htp_compiler_spec(use_fp16=True) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( @@ -6432,6 +6468,43 @@ def test_qnn_backend_profile_op(self): ) TestQNN.profile_level = 0 + def test_qnn_backend_runtime_option_heap_profile(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + module1 = self.get_qdq_module(module, sample_input) + + backend_options = generate_htp_compiler_spec( + use_fp16=False, + use_multi_contexts=True, + ) + + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.soc_model], + backend_options=backend_options, + profile_level=2, # if 0 for closing heap profiling + ) + + pass_jobs = get_capture_program_passes() + split_graph_pass, setting = self.split_graph(4) + pass_jobs[split_graph_pass] = setting + dep_table = get_passes_dependency_for_capture_program() + dep_table[split_graph_pass] = [FoldQDQ] + + edge_prog_mgr = to_edge_transform_and_lower_to_qnn( + module=module1, + inputs=sample_input, + compiler_specs=compiler_specs, + dep_table=dep_table, + passes_job=pass_jobs, + ) + exec_prog = edge_prog_mgr.to_executorch() + self.verify_output( + module1, + sample_input, + exec_prog, + save_heap_result=True, + ) + def test_qnn_backend_runtime_option_htp_performance(self): backend_options = generate_htp_compiler_spec(use_fp16=False) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 93a6dd81f73..1bd0ac1d4c5 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -8,6 +8,7 @@ import subprocess import tempfile import unittest +from pathlib import Path from typing import Callable, Dict, List, Optional, 
OrderedDict, Tuple import numpy as np @@ -191,6 +192,7 @@ class TestQNN(unittest.TestCase): inference_speed_output_path = "outputs/inference_speed.txt" static_llm_eval_method = "" direct_build_folder: str = "" + dsp_heap_profile_filename = "htp_heap_usage.txt" @classmethod def setUpClass(cls): @@ -332,6 +334,7 @@ def verify_output( # noqa: C901 save_inference_speed: bool = False, expected_compared_events: int = -1, qnn_intermediate_debugger: QNNIntermediateDebugger = None, + save_heap_result: bool = False, ): with tempfile.TemporaryDirectory() as tmp_dir: ( @@ -385,6 +388,23 @@ def validate_profile(): len(inspector.to_dataframe().index) >= expected_profile_events ) + def validate_heap_profile(): + file_path = f"{tmp_dir}/{self.dsp_heap_profile_filename}" + self.assertTrue( + Path(file_path).exists(), f"File not found: {file_path}" + ) + with open(file_path, "r") as f: + values = [ + int(line.split(",")[1].strip()) for line in f if line.strip() + ] + self.assertEqual(len(values), 2, f"Expected 2 entries, got {values}") + before, after = values + difference = after - before + + print(f"before_context_created: {before} bytes") + print(f"after_context_freed: {after} bytes") + print(f"difference: {difference:.2f} bytes") + def validate_intermediate_tensor(): inspector = Inspector( etdump_path=etdump_path, debug_buffer_path=debug_output_path @@ -547,6 +567,11 @@ def validate_intermediate_tensor(): adb.extra_cmds += ( f" --performance_output_path {self.inference_speed_output_path}" ) + + if save_heap_result: + adb.extra_cmds += ( + f" --heap_profiling_path {self.dsp_heap_profile_filename}" + ) adb.execute(custom_runner_cmd=f"rm -rf {adb.output_folder}") adb.execute(method_index=method_index, output_callback=output_callback) adb.pull(host_output_path=tmp_dir, callback=post_process) @@ -566,6 +591,12 @@ def validate_intermediate_tensor(): f"{tmp_dir}/{self.inference_speed_output_path}", "r" ) as f: self.inference_speed = float(f.read()) + if save_heap_result: + 
adb.pull_heap_output( + f"{adb.workspace}/{self.dsp_heap_profile_filename}", + f"{tmp_dir}/{self.dsp_heap_profile_filename}", + callback=validate_heap_profile, + ) def lower_module_and_test_output( self,