From 12fe06b38543eb05043a9b813b36f3d4cdcfc0b7 Mon Sep 17 00:00:00 2001 From: Riley Dulin Date: Fri, 9 Aug 2024 15:09:26 -0700 Subject: [PATCH] Fix bundled program and plan_execute in pybindings (#4595) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4595 Bundled programs needed to use the `plan_execute` function on ExecutorchModule because the inputs were previously loaded. Unfortunately this meant that the programs didn't work if they did not have pre-planned outputs (i.e. the caller is supposed to provide the memory). Share code for allocating outputs as necessary between `run_method` and `plan_execute`. Also share code for *returning* those outputs, as there doesn't seem to be a reason to execute the method without getting its output. In order for the output storage to live long enough for the bundled program comparison, add the storages as private data members of `PyModule`. As an improvement in user API, have `verify_result_with_bundled_expected_output` do all of these things at once for the user. 
Differential Revision: D60939315
---
 extension/pybindings/pybindings.cpp | 205 +++++++++++++++++-----------
 extension/pybindings/pybindings.pyi |   5 +-
 2 files changed, 130 insertions(+), 80 deletions(-)

diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp
index 1f6d1075a1e..0dba7603294 100644
--- a/extension/pybindings/pybindings.cpp
+++ b/extension/pybindings/pybindings.cpp
@@ -95,6 +95,34 @@ void write_data_to_file(const std::string& path, void* buf, size_t size) {
   }
 }
 
+void setup_output_storage(
+    executor::Method& method,
+    const std::vector<Span<uint8_t>>& output_storages) {
+  if (output_storages.size() != method.outputs_size()) {
+    THROW_IF_ERROR(
+        Error(),
+        "number of output storages %zu does not match number of outputs %zu",
+        output_storages.size(),
+        method.outputs_size());
+  }
+  for (size_t i = 0; i < output_storages.size(); ++i) {
+    if (output_storages[i].size() == 0) {
+      // Skip empty output storages, this would happen for non-tensor outputs.
+      continue;
+    }
+    Error output_status = method.set_output_data_ptr(
+        output_storages[i].data(), output_storages[i].size(), i);
+    // InvalidState can be the status if outputs are already memory planned.
+    // That's fine and we don't need to alert the user to that error.
+    if (output_status != Error::Ok && output_status != Error::InvalidState) {
+      ET_LOG(
+          Error,
+          "Cannot set_output_data_ptr(): 0x%" PRIx32,
+          static_cast<uint32_t>(output_status));
+    }
+  }
+}
+
 using util::BufferDataLoader;
 using util::MallocMemoryAllocator;
 using util::MmapDataLoader;
@@ -209,26 +237,7 @@ class Module final {
         c10::autograd_dispatch_keyset);
 #endif
     if (output_storages) {
-      if (output_storages->size() != method->outputs_size()) {
-        THROW_IF_ERROR(
-            Error(),
-            "number of output storages %zu does not match number of outputs %zu",
-            output_storages->size(),
-            method->outputs_size());
-      }
-      for (size_t i = 0; i < output_storages->size(); ++i) {
-        Error output_status = method->set_output_data_ptr(
-            (*output_storages)[i].data(), (*output_storages)[i].size(), i);
-        // InvalidState can be the status if outputs are already memory planned.
-        // That's fine and we don't need to alert the user to that error.
-        if (output_status != Error::Ok &&
-            output_status != Error::InvalidState) {
-          ET_LOG(
-              Error,
-              "Cannot set_output_data_ptr(): 0x%" PRIx32,
-              static_cast<uint32_t>(output_status));
-        }
-      }
+      setup_output_storage(*method, *output_storages);
     }
     Error execute_status = method->execute();
     THROW_IF_ERROR(
@@ -236,6 +245,11 @@ class Module final {
         "method->execute() failed with error 0x%" PRIx32,
         static_cast<uint32_t>(execute_status));
     // process outputs
+    return get_outputs(method_name);
+  }
+
+  std::vector<EValue> get_outputs(const std::string& method_name) {
+    auto& method = methods_[method_name];
     std::vector<EValue> result(method->outputs_size());
 
     Error get_outputs_status =
@@ -556,62 +570,17 @@ struct PyModule final {
     const auto& method = module_->get_method(method_name);
     const auto num_outputs = method.outputs_size();
-    // These output storages will not be used if the ExecuTorch program already
-    // pre-allocated output space. That is represented by an error from
-    // set_output_data_ptr.
-    std::vector<std::unique_ptr<uint8_t[]>> output_storages(num_outputs);
+    output_storages_ = make_output_storages(method);
     std::vector<Span<uint8_t>> output_storage_spans(num_outputs);
-    for (size_t i = 0; i < num_outputs; ++i) {
-      const auto& output_tensor_meta =
-          method.method_meta().output_tensor_meta(i);
-      if (!output_tensor_meta.ok()) {
-        // If the output isn't a tensor it won't have a tensor meta.
-        ET_LOG(
-            Info,
-            "Tensor meta doesn't exist for output %zu, error is 0x%" PRIx32
-            ", skipping allocating storage",
-            i,
-            static_cast<uint32_t>(output_tensor_meta.error()));
-        output_storage_spans[i] = Span<uint8_t>();
-        continue;
-      }
-      const size_t output_size = output_tensor_meta.get().nbytes();
-      std::unique_ptr<uint8_t[]> output(new uint8_t[output_size]);
-      output_storage_spans[i] = Span<uint8_t>(output.get(), output_size);
-      output_storages[i] = std::move(output);
+    for (int i = 0; i < output_storages_.size(); ++i) {
+      output_storage_spans[i] =
+          Span<uint8_t>(output_storages_[i].data(), output_storages_[i].size());
     }
     auto outputs =
         module_->run_method(method_name, cpp_inputs, output_storage_spans);
 
     // Retrieve outputs
-    const auto outputs_size = outputs.size();
-    py::list list(outputs_size);
-    for (size_t i = 0; i < outputs_size; ++i) {
-      auto& v = outputs[i];
-      if (Tag::None == v.tag) {
-        list[i] = py::none();
-      } else if (Tag::Int == v.tag) {
-        list[i] = py::cast(v.toInt());
-      } else if (Tag::Double == v.tag) {
-        list[i] = py::cast(v.toDouble());
-      } else if (Tag::Bool == v.tag) {
-        list[i] = py::cast(v.toBool());
-      } else if (Tag::String == v.tag) {
-        list[i] = py::cast(std::string(v.toString().data()));
-      } else if (Tag::Tensor == v.tag) {
-#ifdef USE_ATEN_LIB
-        // Clone so the outputs in python do not share a lifetime with the
-        // module object
-        list[i] = py::cast(v.toTensor().clone());
-#else
-        list[i] = py::cast(
-            torch::util::alias_attensor_to_etensor(v.toTensor()).clone());
-#endif
-      } else {
-        ET_ASSERT_UNREACHABLE_MSG("Invalid model output type");
-      }
-    }
-    return list;
+    return get_outputs_as_py_list(outputs);
   }
 
   py::list forward(const py::sequence& inputs) {
@@ -670,35 +639,113 @@ struct PyModule final {
         static_cast<uint32_t>(status));
   }
 
-  void verify_result_with_bundled_expected_output(
+  py::list verify_result_with_bundled_expected_output(
       PyBundledModule& m,
       const string method_name,
       size_t testset_idx,
       double rtol = 1e-5,
       double atol = 1e-8) {
     const void* bundled_program_ptr = m.get_bundled_program_ptr();
-    Error status = bundled_program::VerifyResultWithBundledExpectedOutput(
-        module_->get_method(method_name),
-        bundled_program_ptr,
-        testset_idx,
-        rtol,
-        atol);
+    auto& method = module_->get_method(method_name);
+    Error status = bundled_program::LoadBundledInput(
+        method, bundled_program_ptr, testset_idx);
+    THROW_IF_ERROR(
+        status,
+        "LoadBundledInput failed with status %" PRIu32,
+        static_cast<uint32_t>(status));
+    py::list outputs = plan_execute(method_name);
+    status = bundled_program::VerifyResultWithBundledExpectedOutput(
+        method, bundled_program_ptr, testset_idx, rtol, atol);
     THROW_IF_ERROR(
         status,
         "Result verification failed with status %" PRIu32,
         static_cast<uint32_t>(status));
+    return outputs;
   }
 
-  void plan_execute(const string method_name) {
-    auto status = module_->get_method(method_name).execute();
+  py::list plan_execute(const string method_name) {
+    auto& method = module_->get_method(method_name);
+    // Need to pre-allocate space for outputs just like in run_method.
+    const auto num_outputs = method.outputs_size();
+    output_storages_ = make_output_storages(method);
+    std::vector<Span<uint8_t>> output_storage_spans(num_outputs);
+    for (int i = 0; i < output_storages_.size(); ++i) {
+      output_storage_spans[i] =
+          Span<uint8_t>(output_storages_[i].data(), output_storages_[i].size());
+    }
+    setup_output_storage(method, output_storage_spans);
+    auto status = method.execute();
     THROW_IF_ERROR(
         status,
         "executing execution plan for method 'forward' failed with error: 0x%" PRIx32,
         static_cast<uint32_t>(status));
+    const auto outputs = module_->get_outputs(method_name);
+    return get_outputs_as_py_list(outputs);
+  }
+
+  py::list get_outputs_as_py_list(const std::vector<EValue>& outputs) {
+    const auto outputs_size = outputs.size();
+    py::list list(outputs_size);
+    for (size_t i = 0; i < outputs_size; ++i) {
+      auto& v = outputs[i];
+      if (Tag::None == v.tag) {
+        list[i] = py::none();
+      } else if (Tag::Int == v.tag) {
+        list[i] = py::cast(v.toInt());
+      } else if (Tag::Double == v.tag) {
+        list[i] = py::cast(v.toDouble());
+      } else if (Tag::Bool == v.tag) {
+        list[i] = py::cast(v.toBool());
+      } else if (Tag::String == v.tag) {
+        list[i] = py::cast(std::string(v.toString().data()));
+      } else if (Tag::Tensor == v.tag) {
+#ifdef USE_ATEN_LIB
+        // Clone so the outputs in python do not share a lifetime with the
+        // module object
+        list[i] = py::cast(v.toTensor().clone());
+#else
+        list[i] = py::cast(
+            torch::util::alias_attensor_to_etensor(v.toTensor()).clone());
+#endif
+      } else {
+        ET_ASSERT_UNREACHABLE_MSG("Invalid model output type");
+      }
+    }
+    return list;
   }
 
  private:
   std::unique_ptr<Module> module_;
+  // Need to keep-alive output storages until they can be compared in case of
+  // bundled programs.
+  std::vector<std::vector<uint8_t>> output_storages_;
+
+  std::vector<std::vector<uint8_t>> make_output_storages(
+      const executor::Method& method) {
+    const auto num_outputs = method.outputs_size();
+    // These output storages will not be used if the ExecuTorch program already
+    // pre-allocated output space. That is represented by an error from
+    // set_output_data_ptr.
+    std::vector<std::vector<uint8_t>> output_storages(num_outputs);
+    for (size_t i = 0; i < num_outputs; ++i) {
+      const auto& output_tensor_meta =
+          method.method_meta().output_tensor_meta(i);
+      if (!output_tensor_meta.ok()) {
+        // If the output isn't a tensor it won't have a tensor meta.
+        ET_LOG(
+            Error,
+            "Tensor meta doesn't exist for output %zu, error is 0x%" PRIx32
+            ", skipping allocating storage",
+            i,
+            static_cast<uint32_t>(output_tensor_meta.error()));
+        output_storages[i] = std::vector<uint8_t>();
+        continue;
+      }
+      const size_t output_size = output_tensor_meta.get().nbytes();
+      output_storages[i] = std::vector<uint8_t>(output_size);
+    }
+    return output_storages;
+  }
 };
 
 void create_profile_block(const std::string& name) {
diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi
index a0859116724..e02ae0046f1 100644
--- a/extension/pybindings/pybindings.pyi
+++ b/extension/pybindings/pybindings.pyi
@@ -14,10 +14,13 @@ class ExecuTorchModule:
     def run_method(self, method_name: str, inputs: Sequence[Any]) -> List[Any]: ...
     # pyre-ignore[2, 3]: "Any" in parameter and return type annotations.
     def forward(self, inputs: Sequence[Any]) -> List[Any]: ...
+    # pyre-ignore[3]: "Any" in return type annotations.
+    def plan_execute(self) -> List[Any]: ...
     # Bundled program methods.
     def load_bundled_input(
         self, bundle: BundledModule, method_name: str, testset_idx: int
     ) -> None: ...
+    # pyre-ignore[3]: "Any" in return type annotations.
     def verify_result_with_bundled_expected_output(
         self,
         bundle: BundledModule,
@@ -25,7 +28,7 @@ class ExecuTorchModule:
         testset_idx: int,
         rtol: float = 1e-5,
         atol: float = 1e-8,
-    ) -> None: ...
+    ) -> List[Any]: ...
     def has_etdump(self) -> bool: ...
     def write_etdump_result_to_file(
         self, path: str, debug_buffer_path: Optional[str] = None