diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md index fb8f9a1c662..f383130b068 100644 --- a/backends/qualcomm/debugger/README.md +++ b/backends/qualcomm/debugger/README.md @@ -50,7 +50,7 @@ Generate optrace and QHAS files using QNN tools under $QNN_SDK_ROOT. After finis adb = SimpleADB( qnn_config=qnn_config, pte_path=f"{args.artifact}/{pte_filename}.pte", - workspace=f"/data/local/tmp/executorch/{pte_filename}, + workspace=f"/data/local/tmp/executorch/{pte_filename}", ) binaries_trace = generate_optrace( args, adb, f"{args.artifact}/{pte_filename}.pte", example_input @@ -78,7 +78,7 @@ qairt_visualizer.view(reports=[optrace, qhas]) - `model`: Path to your QNN model file (e.g., `path_to_your_model.dlc`). - **`reports`**: List of report file paths, including the optrace (`optrace.json`) and QHAS (`optrace_qnn_htp_analysis_summary.json`). -Note: Files ending with `.bin ` do not support graph visualization in qairt_visualizer. +Note: Files ending with `.bin` do not support graph visualization in qairt_visualizer. ## Demo @@ -226,3 +226,79 @@ python examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py -b build 2. Please ignore this if you are using `qnn_executor_runner`. If you have decided to write your own runner, please follow the [tutorial](https://pytorch.org/executorch/stable/etdump.html) on how to implement etdump into your own runner. 3. The current debugger does not support graph with partitions. (WIP) 4. The current debugger does not support LLM models. (WIP) + + +## ExecuTorch QNN HTP Heap Profiling + +Measures DSP memory usage when using context binary models on the HTP backend. + +### Introduction + +DSP heap profiling is available for `QnnContext_createFromBinary` use-cases. 
It captures total DSP heap usage at two checkpoints:
+
+- **Before the first context is created** (`before_context_created`)
+- **After the last context is freed** (`after_context_freed`)
+
+The difference between the two values represents heap consumed during context execution. The value after freeing is typically equal to or greater than before creation.
+
+### Instructions
+
+#### Run the example test
+
+```bash
+python backends/qualcomm/tests/test_qnn_delegate.py \
+    TestQNNQuantizedUtils.test_qnn_backend_runtime_option_heap_profile \
+    -b build-android -H ${HOST} -s ${SN} -m ${SOC_MODEL}
+```
+
+See [test_qnn_delegate.py](../tests/test_qnn_delegate.py) for the full test implementation.
+
+#### Setting
+
+```python
+from executorch.backends.qualcomm.utils.utils import generate_htp_compiler_spec
+from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_compiler_spec
+
+backend_options = generate_htp_compiler_spec(
+    use_multi_contexts=True,
+)
+
+compiler_specs = generate_qnn_executorch_compiler_spec(
+    soc_model=self.chipset_table[TestQNN.soc_model],
+    backend_options=backend_options,
+    profile_level=2,
+)
+
+# ...
+
+self.verify_output(
+    module,
+    sample_input,
+    exec_prog,
+    save_heap_result=True,
+)
+```
+
+#### Output file format
+
+The result is written to a text file (default: `htp_heap_usage.txt`) with two lines:
+
+```
+DSP:before_context_created (bytes),
+DSP:after_context_freed (bytes),
+```
+
+#### Reference result
+
+Measured on SM8850. A difference of 0 means no additional heap is consumed during context binary execution.
+
+```console
+First value (before_context_created): 928212 bytes
+Second value (after_context_freed): 928212 bytes
+difference: 0.00 bytes
+```
+
+### Limitations
+
+1. Only supported on the HTP backend on Android and QNX platforms.
+2. By enabling this feature, initialization and cleanup time might be impacted.
diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py index 313573e523a..9b3703357c3 100644 --- a/backends/qualcomm/export_utils.py +++ b/backends/qualcomm/export_utils.py @@ -494,6 +494,11 @@ def pull_debug_output(self, etdump_path, debug_ouput_path, callback=None): if callback: callback() + def pull_heap_output(self, src_file_path, dst_folder, callback=None): + self._adb(["pull", src_file_path, dst_folder]) + if callback: + callback() + def build_executorch_binary( model: torch.nn.Module, # noqa: B006 diff --git a/backends/qualcomm/runtime/QnnBackendOptions.cpp b/backends/qualcomm/runtime/QnnBackendOptions.cpp index 0eb678b45e2..2117932bddc 100644 --- a/backends/qualcomm/runtime/QnnBackendOptions.cpp +++ b/backends/qualcomm/runtime/QnnBackendOptions.cpp @@ -52,6 +52,14 @@ template QnnExecuTorchProfileLevel get_option( QnnExecuTorchProfileLevel, const char*); +executorch::runtime::Error get_runtime_option( + const char* key, + executorch::runtime::BackendOption& backend_option) { + std::strncpy(backend_option.key, key, runtime::kMaxOptionKeyLength); + backend_option.key[runtime::kMaxOptionKeyLength - 1] = '\0'; + return get_option(QNN_BACKEND, backend_option); +} + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/QnnBackendOptions.h b/backends/qualcomm/runtime/QnnBackendOptions.h index c366755edd0..93e0de1fb61 100644 --- a/backends/qualcomm/runtime/QnnBackendOptions.h +++ b/backends/qualcomm/runtime/QnnBackendOptions.h @@ -37,6 +37,19 @@ struct RuntimeOption { template T get_option(T aot_option, const char* aot_key); +/** + * @brief + * Get the backend option. + * This method checks runtime option only. + * + * @param key The key of runtime option. + * @param backend_option The backend_option to be restored in runtime. 
+ */ + +executorch::runtime::Error get_runtime_option( + const char* key, + executorch::runtime::BackendOption& backend_option); + } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index 8a0ee3fed4b..9699e5b4735 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -25,6 +25,7 @@ #define QNN_RUNTIME_LPAI_CLIENT_PERF_TYPE "qnn_runtime_lpai_client_perf_type" #define QNN_RUNTIME_LPAI_AFFINITY "qnn_runtime_lpai_affinity" #define QNN_RUNTIME_LPAI_CORE_SELECTION "qnn_runtime_lpai_core_selection" +#define QNN_RUNTIME_HEAP_PROFILING_PATH "qnn_runtime_heap_profiling_path" #ifdef __cplusplus extern "C" { diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 33cca5350d9..b47fa42b268 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -245,6 +245,13 @@ executorch::runtime::Error QnnExecuTorchBackend::set_option( qnn_runtime_lpai_core_selection_.value = *val; qnn_runtime_lpai_core_selection_.is_set = true; } + } else if (strcmp(option.key, QNN_RUNTIME_HEAP_PROFILING_PATH) == 0) { + if (auto* val = + std::get_if>( + &option.value)) { + qnn_runtime_heap_profiling_path_.value = *val; + qnn_runtime_heap_profiling_path_.is_set = true; + } } else { ET_LOG( Error, @@ -303,6 +310,10 @@ executorch::runtime::Error QnnExecuTorchBackend::get_option( strcmp(backend_options[i].key, QNN_RUNTIME_LPAI_CORE_SELECTION) == 0 && qnn_runtime_lpai_core_selection_.is_set) { backend_options[i].value = qnn_runtime_lpai_core_selection_.value; + } else if ( + strcmp(backend_options[i].key, QNN_RUNTIME_HEAP_PROFILING_PATH) == 0 && + qnn_runtime_heap_profiling_path_.is_set) { + backend_options[i].value = qnn_runtime_heap_profiling_path_.value; } else { // either runtime never called set_option or key 
does not exist matches--; diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index 942e61e2267..e3548c8752b 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -71,6 +71,7 @@ class QnnExecuTorchBackend final RuntimeOption qnn_runtime_lpai_client_perf_type_{false, 0}; RuntimeOption qnn_runtime_lpai_affinity_{false, 0}; RuntimeOption qnn_runtime_lpai_core_selection_{false, 0}; + RuntimeOption qnn_runtime_heap_profiling_path_{false, {}}; }; } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index fa2008befd5..4e819a43121 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -71,7 +71,8 @@ std::unique_ptr QnnBackendFactory::Create( qnn_device_ptr, backend_params->qnn_backend_cache_ptr_.get(), htp_options, - qnn_dlc_manager); + qnn_dlc_manager, + get_option(options->profile_level(), QNN_RUNTIME_PROFILE_LEVEL)); backend_params->qnn_graph_ptr_ = std::make_unique( implementation_ptr, diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp index e16a173db6c..e81f92a8003 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -13,12 +14,46 @@ namespace executorch { namespace backends { namespace qnn { +std::mutex QnnContext::htp_context_mutex_; +int QnnContext::htp_context_count_{0}; + +void QnnContext::WriteHeapProfile() { + executorch::runtime::BackendOption backend_option; + std::string heap_profiling_path; + if (get_runtime_option(QNN_RUNTIME_HEAP_PROFILING_PATH, backend_option) == + Error::Ok) { + auto* arr = std::get_if>( + &backend_option.value); + if (arr) { + heap_profiling_path = arr->data(); + } + } + Qnn_ErrorHandle_t error_profile = + qnn_profiler_->ProfileDataToFile(heap_profiling_path); + if (error_profile != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to profile. Cannot get profile from handle. Error %d", + QNN_GET_ERROR_CODE(error_profile)); + } +} + QnnContext::~QnnContext() { const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); Qnn_ErrorHandle_t error = QNN_SUCCESS; + if (handle_ != nullptr) { QNN_EXECUTORCH_LOG_INFO("Destroy Qnn context"); - error = qnn_interface.qnn_context_free(handle_, /*profile=*/nullptr); + + bool do_heap_profile = false; + { + std::lock_guard lock(htp_context_mutex_); + if (is_htp_backend_ && htp_context_count_ > 0 && need_to_profile_) { + --htp_context_count_; + do_heap_profile = (htp_context_count_ == 0); + } + } + error = qnn_interface.qnn_context_free( + handle_, do_heap_profile ? 
qnn_profiler_->GetHandle() : nullptr); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Failed to free QNN " @@ -26,6 +61,8 @@ QnnContext::~QnnContext() { "ID %u, error %d", qnn_interface.GetBackendId(), QNN_GET_ERROR_CODE(error)); + } else if (do_heap_profile) { + WriteHeapProfile(); } handle_ = nullptr; } @@ -45,21 +82,51 @@ Error QnnContext::Configure() { if (cache_->GetCacheState() == QnnBackendCache::DESERIALIZE) { const QnnExecuTorchContextBinary& qnn_context_blob = cache_->GetQnnContextBlob(); + /* + Total DSP heap usage can be measured in two conditions, first context + creation and last context free. By the QNN documentation, we need to insert + profileHandle in qnn_context_create_from_binary when creating first context + and closing last context. + + Limitations are two: + 1.Only supported on Android and QNX platforms. + 2.By enabling this feature initialization and cleanup time might be + impacted. + */ + + bool do_heap_profile = false; + { + std::lock_guard lock(htp_context_mutex_); + do_heap_profile = + is_htp_backend_ && (htp_context_count_ == 0) && need_to_profile_; + if (is_htp_backend_) { + ++htp_context_count_; + } + } error = qnn_interface.qnn_context_create_from_binary( backend_->GetHandle(), device_->GetHandle(), - temp_context_config.empty() ? nullptr : temp_context_config.data(), + (temp_context_config.empty() ? nullptr : temp_context_config.data()), static_cast(qnn_context_blob.buffer), qnn_context_blob.nbytes, &handle_, - /*profile=*/nullptr); + do_heap_profile ? qnn_profiler_->GetHandle() : nullptr); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Can't create context from " "binary. 
Error %d.", QNN_GET_ERROR_CODE(error)); + // Rollback the count since context creation failed + { + std::lock_guard lock(htp_context_mutex_); + if (is_htp_backend_ && htp_context_count_ > 0) { + --htp_context_count_; + } + } return Error::Internal; + } else if (do_heap_profile) { + WriteHeapProfile(); } } else if ( cache_->GetCacheState() == QnnBackendCache::SERIALIZE || diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h index 7d507a4a50c..369728bb904 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.h +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h @@ -13,7 +13,10 @@ #include #include +#include + #include +#include namespace executorch { namespace backends { @@ -28,13 +31,23 @@ class QnnContext { QnnBackend* backend, QnnDevice* device, QnnBackendCache* cache, - QnnDlcManager* qnn_dlc_manager) + QnnDlcManager* qnn_dlc_manager, + const QnnExecuTorchProfileLevel& profile_level) : handle_(nullptr), implementation_(implementation), backend_(backend), device_(device), cache_(cache), - qnn_dlc_manager_(qnn_dlc_manager) {} + qnn_dlc_manager_(qnn_dlc_manager), + profile_level_(profile_level), + is_htp_backend_( + implementation->GetQnnInterface().GetBackendId() == + QNN_BACKEND_ID_HTP), + need_to_profile_( + profile_level != QnnExecuTorchProfileLevel::kProfileOff) { + qnn_profiler_ = + std::make_unique(implementation_, backend_, profile_level_); + } virtual ~QnnContext(); @@ -73,6 +86,7 @@ class QnnContext { }; private: + void WriteHeapProfile(); Qnn_ContextHandle_t handle_; QnnImplementation* implementation_; QnnBackend* backend_; @@ -80,6 +94,13 @@ class QnnContext { QnnBackendCache* cache_; QnnContextCustomProtocol qnn_context_custom_protocol_; QnnDlcManager* qnn_dlc_manager_; + + QnnExecuTorchProfileLevel profile_level_; + std::unique_ptr qnn_profiler_; + bool is_htp_backend_; + bool need_to_profile_; + static std::mutex htp_context_mutex_; + static int 
htp_context_count_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index b4650b30796..177e3761afd 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -8,10 +8,34 @@ #include +#include +#include + namespace executorch { namespace backends { namespace qnn { +#define DEFINE_HEAP_BEFORE_CREATION "DSP:before_context_created" +#define DEFINE_HEAP_AFTER_FREED "DSP:after_context_freed" + +namespace { +const char* get_event_unit(QnnProfile_EventUnit_t unit) { + switch (unit) { + case QNN_PROFILE_EVENTUNIT_MICROSEC: + return " (us)"; + case QNN_PROFILE_EVENTUNIT_BYTES: + return " (bytes)"; + case QNN_PROFILE_EVENTUNIT_COUNT: + return " (count)"; + case QNN_PROFILE_EVENTUNIT_BACKEND: + // cycle unit is default appeared + case QNN_PROFILE_EVENTUNIT_CYCLES: + default: + return ""; + } +} +} // namespace + QnnProfile::QnnProfile( QnnImplementation* implementation, QnnBackend* backend, @@ -71,36 +95,36 @@ QnnProfile::QnnProfile( } } +Qnn_ErrorHandle_t QnnProfile::FetchEvents( + const QnnProfile_EventId_t** events_ptr, + std::uint32_t* num_events) { + if (handle_ == nullptr) { + QNN_EXECUTORCH_LOG_WARN("Profile handle is null, skipping FetchEvents"); + *num_events = 0; + return QNN_SUCCESS; + } + const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); + Qnn_ErrorHandle_t error = + qnn_interface.qnn_profile_get_events(handle_, events_ptr, num_events); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to get profile events: %d", QNN_GET_ERROR_CODE(error)); + } + return error; +} + Qnn_ErrorHandle_t QnnProfile::ProfileData( executorch::runtime::EventTracer* event_tracer) { - const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); const QnnProfile_EventId_t* events_ptr = nullptr; - const QnnProfile_EventId_t* sub_events_ptr = nullptr; std::uint32_t 
num_events = 0; - std::uint32_t num_sub_events = 0; - Qnn_ErrorHandle_t error = - qnn_interface.qnn_profile_get_events(handle_, &events_ptr, &num_events); + Qnn_ErrorHandle_t error = FetchEvents(&events_ptr, &num_events); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( - "ProfileData failed to get events: %d", QNN_GET_ERROR_CODE(error)); + "Failed to profile data in function FetchEvents: %d", + QNN_GET_ERROR_CODE(error)); return error; } - - auto get_unit = [](QnnProfile_EventUnit_t unit) { - switch (unit) { - case QNN_PROFILE_EVENTUNIT_MICROSEC: - return " (us)"; - case QNN_PROFILE_EVENTUNIT_BYTES: - return " (bytes)"; - case QNN_PROFILE_EVENTUNIT_COUNT: - return " (count)"; - case QNN_PROFILE_EVENTUNIT_BACKEND: - // cycle unit is default appeared - case QNN_PROFILE_EVENTUNIT_CYCLES: - default: - return ""; - } - }; + const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); QnnProfile_EventData_t event_data; for (std::uint32_t i = 0; i < num_events; ++i) { error = @@ -115,7 +139,7 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( } // add events for other important metrics, e.g. 
RPC execution time std::string identifier = - std::string(event_data.identifier) + get_unit(event_data.unit); + std::string(event_data.identifier) + get_event_unit(event_data.unit); executorch::runtime::event_tracer_log_profiling_delegate( event_tracer, identifier.c_str(), @@ -125,48 +149,114 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( event_data.value); // Check an event's sub events only if it relates to graph execution time // (and its sub events are the individual op executions): - if (backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) { - error = qnn_interface.qnn_profile_get_sub_events( - events_ptr[i], &sub_events_ptr, &num_sub_events); + if (!backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) { + continue; + } + const QnnProfile_EventId_t* sub_events_ptr = nullptr; + std::uint32_t num_sub_events = 0; + error = qnn_interface.qnn_profile_get_sub_events( + events_ptr[i], &sub_events_ptr, &num_sub_events); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "ProfileData failed to get sub events " + "for event %d: %d", + i, + QNN_GET_ERROR_CODE(error)); + return error; + } + + QnnProfile_EventData_t sub_event_data; + for (std::uint32_t j = 0; j < num_sub_events; ++j) { + error = qnn_interface.qnn_profile_get_event_data( + sub_events_ptr[j], &sub_event_data); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( - "ProfileData failed to get sub events " - "for event %d: %d", + "ProfileData failed to get sub " + "event data for sub event %d of event %d: %d", + j, i, QNN_GET_ERROR_CODE(error)); return error; } - - QnnProfile_EventData_t sub_event_data; - for (std::uint32_t j = 0; j < num_sub_events; ++j) { - error = qnn_interface.qnn_profile_get_event_data( - sub_events_ptr[j], &sub_event_data); - if (error != QNN_SUCCESS) { - QNN_EXECUTORCH_LOG_ERROR( - "ProfileData failed to get sub " - "event data for sub event %d of event %d: %d", - j, - i, - QNN_GET_ERROR_CODE(error)); - return error; - } - if (sub_event_data.type == 
QNN_PROFILE_EVENTTYPE_NODE && - (sub_event_data.unit == QNN_PROFILE_EVENTUNIT_MICROSEC || - sub_event_data.unit == QNN_PROFILE_EVENTUNIT_CYCLES)) { - executorch::runtime::event_tracer_log_profiling_delegate( - event_tracer, - sub_event_data.identifier, - /*delegate_debug_id=*/ - static_cast(-1), - 0, - sub_event_data.value); - } + if (sub_event_data.type == QNN_PROFILE_EVENTTYPE_NODE && + (sub_event_data.unit == QNN_PROFILE_EVENTUNIT_MICROSEC || + sub_event_data.unit == QNN_PROFILE_EVENTUNIT_CYCLES)) { + executorch::runtime::event_tracer_log_profiling_delegate( + event_tracer, + sub_event_data.identifier, + /*delegate_debug_id=*/ + static_cast(-1), + 0, + sub_event_data.value); } } } return error; } +Qnn_ErrorHandle_t QnnProfile::ProfileDataToFile( + const std::string profile_filename) { + if (handle_ == nullptr) { + QNN_EXECUTORCH_LOG_WARN( + "Profile handle is null, skipping ProfileDataToFile"); + return QNN_SUCCESS; + } + if (profile_filename.empty()) { + QNN_EXECUTORCH_LOG_WARN( + "Heap profiling path is empty. 
Please provide profiling filename from runtime option."); + return QNN_SUCCESS; + } + const QnnProfile_EventId_t* events_ptr = nullptr; + std::uint32_t num_events = 0; + Qnn_ErrorHandle_t error = FetchEvents(&events_ptr, &num_events); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to profile data in function FetchEvents: %d", + QNN_GET_ERROR_CODE(error)); + return error; + } + const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); + QnnProfile_EventData_t event_data; + std::uint32_t count_num_events = 0; + for (std::uint32_t i = 0; i < num_events; ++i) { + error = + qnn_interface.qnn_profile_get_event_data(events_ptr[i], &event_data); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "ProfileData failed to get event data " + "for event %d: %d", + i, + QNN_GET_ERROR_CODE(error)); + return error; + } + + std::ios_base::openmode open_mode = std::ios::app; + if (strcmp(event_data.identifier, DEFINE_HEAP_BEFORE_CREATION) == 0) { + open_mode = std::ios::trunc; + } else if (strcmp(event_data.identifier, DEFINE_HEAP_AFTER_FREED) == 0) { + open_mode = std::ios::app; + } else { + count_num_events++; + continue; + } + std::string identifier = + std::string(event_data.identifier) + get_event_unit(event_data.unit); + std::ofstream ofs(profile_filename, open_mode); + if (!ofs) { + QNN_EXECUTORCH_LOG_ERROR( + "Error when opening profile file: %s", profile_filename.c_str()); + return QNN_COMMON_ERROR_GENERAL; + } + ofs << identifier << ", " << event_data.value << "\n"; + } + if (count_num_events == num_events) { + QNN_EXECUTORCH_LOG_WARN( + "Not HTP backend but enable htp profiling. 
Please check setting."); + return QNN_SUCCESS; + } + return error; +} + QnnProfile::~QnnProfile() { const QnnInterface& qnn_interface = implementation_->GetQnnInterface(); if (handle_ != nullptr) { diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.h b/backends/qualcomm/runtime/backends/QnnProfiler.h index de8fbd1d9d5..971738a28d6 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.h +++ b/backends/qualcomm/runtime/backends/QnnProfiler.h @@ -12,6 +12,9 @@ #include #include #include "QnnProfile.h" + +#include + namespace executorch { namespace backends { namespace qnn { @@ -24,6 +27,7 @@ class QnnProfile { const QnnExecuTorchProfileLevel& profile_level); ~QnnProfile(); Qnn_ErrorHandle_t ProfileData(executorch::runtime::EventTracer* event_tracer); + Qnn_ErrorHandle_t ProfileDataToFile(const std::string profile_filename); Qnn_ProfileHandle_t GetHandle() { return handle_; @@ -33,6 +37,10 @@ class QnnProfile { Qnn_ProfileHandle_t handle_; QnnImplementation* implementation_; QnnBackend* backend_; + + Qnn_ErrorHandle_t FetchEvents( + const QnnProfile_EventId_t** events_ptr, + std::uint32_t* num_events); }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp b/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp index 07952e77eef..c6c6ace2bdf 100644 --- a/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp +++ b/backends/qualcomm/runtime/backends/gpu/GpuContext.cpp @@ -21,7 +21,13 @@ GpuContext::GpuContext( QnnBackendCache* cache, QnnDlcManager* qnn_dlc_manager, const QnnExecuTorchGpuBackendOptions* gpu_options) - : QnnContext(implementation, backend, device, cache, qnn_dlc_manager) { + : QnnContext( + implementation, + backend, + device, + cache, + qnn_dlc_manager, + QnnExecuTorchProfileLevel::kProfileOff) { gpu_context_custom_config_ = std::make_unique(gpu_options); } diff --git a/backends/qualcomm/runtime/backends/htp/HtpContext.h b/backends/qualcomm/runtime/backends/htp/HtpContext.h index 
a0389ea5983..f00b709f607 100644 --- a/backends/qualcomm/runtime/backends/htp/HtpContext.h +++ b/backends/qualcomm/runtime/backends/htp/HtpContext.h @@ -25,10 +25,17 @@ class HtpContext : public QnnContext { QnnDevice* device, QnnBackendCache* cache, const QnnExecuTorchHtpBackendOptions* htp_options, - QnnDlcManager* qnn_dlc_manager) - : QnnContext(implementation, backend, device, cache, qnn_dlc_manager) { - htp_context_custom_config_ = - std::make_unique(this, htp_options); + QnnDlcManager* qnn_dlc_manager, + const QnnExecuTorchProfileLevel& profile_level) + : QnnContext( + implementation, + backend, + device, + cache, + qnn_dlc_manager, + profile_level) { + htp_context_custom_config_ = std::make_unique( + this, htp_options, profile_level); } ~HtpContext() {} diff --git a/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h b/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h index f0d4873b0d2..64cb279a1c7 100644 --- a/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htp/HtpContextCustomConfig.h @@ -26,12 +26,16 @@ class HtpContextCustomConfig { public: explicit HtpContextCustomConfig( const QnnContext* context, - const QnnExecuTorchHtpBackendOptions* htp_options) - : context_(context), htp_options_(htp_options) {} + const QnnExecuTorchHtpBackendOptions* htp_options, + const QnnExecuTorchProfileLevel& profile_level) + : profile_level_(profile_level), + context_(context), + htp_options_(htp_options) {} std::vector CreateContextCustomConfig(); private: + [[maybe_unused]] QnnExecuTorchProfileLevel profile_level_; QnnHtpContext_CustomConfig_t* AllocContextCustomConfig() { htp_context_config_.emplace_back( std::make_unique()); diff --git a/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp index 4850afa14a2..037998132a8 100644 --- 
a/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htp/host/HtpContextCustomConfig.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include namespace executorch { diff --git a/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp index 676795797f8..8488bf21e79 100644 --- a/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp +++ b/backends/qualcomm/runtime/backends/htp/target/HtpContextCustomConfig.cpp @@ -19,6 +19,17 @@ HtpContextCustomConfig::CreateContextCustomConfig() { QnnHtpContext_CustomConfig_t* p_custom_config = nullptr; const HtpContext* htp_ctx = static_cast(context_); + // TODO: Verify heap profile works with kProfileBasic once enabled. + if (profile_level_ != QnnExecuTorchProfileLevel::kProfileOff) { + QnnHtpContext_CustomConfig_t* p_custom_config_profile = nullptr; + p_custom_config_profile = AllocContextCustomConfig(); + p_custom_config_profile->option = + QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED; + p_custom_config_profile->dspMemoryProfilingEnabled = true; + ret.push_back( + static_cast(p_custom_config_profile)); + } + if (htp_options_->use_multi_contexts() && htp_options_->max_sf_buf_size() != 0) { p_custom_config = AllocContextCustomConfig(); diff --git a/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp b/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp index 47d583b5c15..62d01c78706 100644 --- a/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp +++ b/backends/qualcomm/runtime/backends/ir/host/QnnDlcManager.cpp @@ -47,7 +47,8 @@ Error QnnDlcManager::Create() { backend_bundle_ptr_->qnn_backend_ptr.get(), backend_bundle_ptr_->qnn_device_ptr.get(), backend_params_ptr_->qnn_backend_cache_ptr_.get(), - nullptr); + nullptr, + QnnExecuTorchProfileLevel::kProfileOff); 
backend_params_ptr_->qnn_graph_ptr_ = std::make_unique( backend_bundle_ptr_->implementation.get(), diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp index d5203898f6b..e0c9d3ed3d8 100644 --- a/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp +++ b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp @@ -20,7 +20,13 @@ LpaiContext::LpaiContext( QnnDevice* device, QnnBackendCache* cache, QnnDlcManager* qnn_dlc_manager) - : QnnContext(implementation, backend, device, cache, qnn_dlc_manager) { + : QnnContext( + implementation, + backend, + device, + cache, + qnn_dlc_manager, + QnnExecuTorchProfileLevel::kProfileOff) { lpai_context_custom_config_ = std::make_unique(); } diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index d76e3ea1df7..c44ac6efb59 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -5528,6 +5528,42 @@ def test_qnn_backend_profile_op(self): ) TestQNN.profile_level = 0 + def test_qnn_backend_runtime_option_heap_profile(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + backend_options = generate_htp_compiler_spec( + use_fp16=True, + use_multi_contexts=True, + ) + + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.soc_model], + backend_options=backend_options, + profile_level=2, # if 0 for closing heap profiling + ) + + pass_jobs = get_capture_program_passes() + split_graph_pass, setting = self.split_graph(4) + pass_jobs[split_graph_pass] = setting + dep_table = get_passes_dependency_for_capture_program() + dep_table[split_graph_pass] = [FoldQDQ] + + edge_prog_mgr = to_edge_transform_and_lower_to_qnn( + module=module, + inputs=sample_input, + compiler_specs=compiler_specs, + dep_table=dep_table, + passes_job=pass_jobs, + ) + exec_prog = 
edge_prog_mgr.to_executorch() + self.verify_output( + module, + sample_input, + exec_prog, + save_heap_result=True, + ) + def test_qnn_backend_runtime_option_htp_performance(self): backend_options = generate_htp_compiler_spec(use_fp16=True) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( @@ -6432,6 +6468,43 @@ def test_qnn_backend_profile_op(self): ) TestQNN.profile_level = 0 + def test_qnn_backend_runtime_option_heap_profile(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + module1 = self.get_qdq_module(module, sample_input) + + backend_options = generate_htp_compiler_spec( + use_fp16=False, + use_multi_contexts=True, + ) + + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.chipset_table[TestQNN.soc_model], + backend_options=backend_options, + profile_level=2, # if 0 for closing heap profiling + ) + + pass_jobs = get_capture_program_passes() + split_graph_pass, setting = self.split_graph(4) + pass_jobs[split_graph_pass] = setting + dep_table = get_passes_dependency_for_capture_program() + dep_table[split_graph_pass] = [FoldQDQ] + + edge_prog_mgr = to_edge_transform_and_lower_to_qnn( + module=module1, + inputs=sample_input, + compiler_specs=compiler_specs, + dep_table=dep_table, + passes_job=pass_jobs, + ) + exec_prog = edge_prog_mgr.to_executorch() + self.verify_output( + module1, + sample_input, + exec_prog, + save_heap_result=True, + ) + def test_qnn_backend_runtime_option_htp_performance(self): backend_options = generate_htp_compiler_spec(use_fp16=False) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 93a6dd81f73..1bd0ac1d4c5 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -8,6 +8,7 @@ import subprocess import tempfile import unittest +from pathlib import Path from typing import Callable, Dict, List, Optional, 
OrderedDict, Tuple import numpy as np @@ -191,6 +192,7 @@ class TestQNN(unittest.TestCase): inference_speed_output_path = "outputs/inference_speed.txt" static_llm_eval_method = "" direct_build_folder: str = "" + dsp_heap_profile_filename = "htp_heap_usage.txt" @classmethod def setUpClass(cls): @@ -332,6 +334,7 @@ def verify_output( # noqa: C901 save_inference_speed: bool = False, expected_compared_events: int = -1, qnn_intermediate_debugger: QNNIntermediateDebugger = None, + save_heap_result: bool = False, ): with tempfile.TemporaryDirectory() as tmp_dir: ( @@ -385,6 +388,23 @@ def validate_profile(): len(inspector.to_dataframe().index) >= expected_profile_events ) + def validate_heap_profile(): + file_path = f"{tmp_dir}/{self.dsp_heap_profile_filename}" + self.assertTrue( + Path(file_path).exists(), f"File not found: {file_path}" + ) + with open(file_path, "r") as f: + values = [ + int(line.split(",")[1].strip()) for line in f if line.strip() + ] + self.assertEqual(len(values), 2, f"Expected 2 entries, got {values}") + before, after = values + difference = after - before + + print(f"before_context_created: {before} bytes") + print(f"after_context_freed: {after} bytes") + print(f"difference: {difference:.2f} bytes") + def validate_intermediate_tensor(): inspector = Inspector( etdump_path=etdump_path, debug_buffer_path=debug_output_path @@ -547,6 +567,11 @@ def validate_intermediate_tensor(): adb.extra_cmds += ( f" --performance_output_path {self.inference_speed_output_path}" ) + + if save_heap_result: + adb.extra_cmds += ( + f" --heap_profiling_path {self.dsp_heap_profile_filename}" + ) adb.execute(custom_runner_cmd=f"rm -rf {adb.output_folder}") adb.execute(method_index=method_index, output_callback=output_callback) adb.pull(host_output_path=tmp_dir, callback=post_process) @@ -566,6 +591,12 @@ def validate_intermediate_tensor(): f"{tmp_dir}/{self.inference_speed_output_path}", "r" ) as f: self.inference_speed = float(f.read()) + if save_heap_result: + 
adb.pull_heap_output( + f"{adb.workspace}/{self.dsp_heap_profile_filename}", + f"{tmp_dir}/{self.dsp_heap_profile_filename}", + callback=validate_heap_profile, + ) def lower_module_and_test_output( self,