From 7b15ff31432d61a2d2e164713f857cacaeecb886 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Fri, 6 Dec 2024 08:53:01 -0800
Subject: [PATCH 1/2] [ET-VK] Removing unnecessary and redundant members from
 VulkanBuffer and ParamsBuffer.

Pull Request resolved: https://github.com/pytorch/executorch/pull/7144

This diff removes unnecessary and redundant members from VulkanBuffer and ParamsBuffer.
ghstack-source-id: 256911526
@exported-using-ghexport

Differential Revision: [D66557456](https://our.internmc.facebook.com/intern/diff/D66557456/)
---
 backends/vulkan/runtime/api/containers/ParamsBuffer.h | 5 +----
 backends/vulkan/runtime/vk_api/memory/Buffer.cpp      | 9 ++-------
 backends/vulkan/runtime/vk_api/memory/Buffer.h        | 1 -
 3 files changed, 3 insertions(+), 12 deletions(-)
diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.h b/backends/vulkan/runtime/api/containers/ParamsBuffer.h
index c3f19028edc..fe157c5e014 100644
--- a/backends/vulkan/runtime/api/containers/ParamsBuffer.h
+++ b/backends/vulkan/runtime/api/containers/ParamsBuffer.h
@@ -20,7 +20,6 @@ namespace api {
 class ParamsBuffer final {
  private:
   Context* context_p_;
-  size_t nbytes_;
   vkapi::VulkanBuffer vulkan_buffer_;
 
  public:
@@ -29,14 +28,12 @@ class ParamsBuffer final {
   template <typename Block>
   ParamsBuffer(Context* context_p, const Block& block)
       : context_p_(context_p),
-        nbytes_(sizeof(block)),
         vulkan_buffer_(
             context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}
 
   template <typename Block>
   ParamsBuffer(Context* context_p, const VkDeviceSize nbytes)
       : context_p_(context_p),
-        nbytes_(nbytes),
         vulkan_buffer_(
             context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {}
 
@@ -70,7 +67,7 @@ class ParamsBuffer final {
   template <typename T>
   T read() const {
     T val;
-    if (sizeof(val) != nbytes_) {
+    if (sizeof(val) != vulkan_buffer_.mem_size()) {
       VK_THROW(
           "Attempted to store value from ParamsBuffer to type of different size");
     }
diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp
index 9fa3c2ac776..4f58e07b146 100644
--- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp
+++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp
@@ -29,12 +29,7 @@ VulkanBuffer::VulkanBuffer(
     const VmaAllocationCreateInfo& allocation_create_info,
     const VkBufferUsageFlags usage,
     const bool allocate_memory)
-    : buffer_properties_({
-          size,
-          0u,
-          size,
-          usage,
-      }),
+    : buffer_properties_({size, 0u, size}),
       allocator_(vma_allocator),
       memory_{},
       owns_memory_(allocate_memory),
@@ -52,7 +47,7 @@ VulkanBuffer::VulkanBuffer(
       nullptr, // pNext
       0u, // flags
       buffer_properties_.size, // size
-      buffer_properties_.buffer_usage, // usage
+      usage, // usage
       VK_SHARING_MODE_EXCLUSIVE, // sharingMode
       0u, // queueFamilyIndexCount
       nullptr, // pQueueFamilyIndices
diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h
index 347c5dd917b..0ef9f7e95e4 100644
--- a/backends/vulkan/runtime/vk_api/memory/Buffer.h
+++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h
@@ -48,7 +48,6 @@ class VulkanBuffer final {
     VkDeviceSize size;
     VkDeviceSize mem_offset;
     VkDeviceSize mem_range;
-    VkBufferUsageFlags buffer_usage;
   };
 
   explicit VulkanBuffer();

From c9a573927509db308e4509cbbbba50acfca87742 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Fri, 6 Dec 2024 08:53:02 -0800
Subject: [PATCH 2/2] [ET-VK] Store unique ptr to Tensor in Value instead of
 inlined tensor object, to reduce Value struct size from 448 to 80 bytes.

Pull Request resolved: https://github.com/pytorch/executorch/pull/7145

This diff aims to reduce the size of the Value struct in the Executorch Vulkan runtime by storing a unique pointer to the Tensor object instead of an inlined tensor object. This change reduces the size of the Value struct from 448 bytes to 80 bytes, which can improve performance and reduce memory usage.
ghstack-source-id: 256911524
@exported-using-ghexport

Differential Revision: [D66655991](https://our.internmc.facebook.com/intern/diff/D66655991/)
---
 .../vulkan/runtime/graph/containers/Value.h   | 55 ++++++++++++++-----
 .../vulkan/test/vulkan_compute_api_test.cpp   |  4 +-
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h
index 8773f0c0b04..83669c85b1c 100644
--- a/backends/vulkan/runtime/graph/containers/Value.h
+++ b/backends/vulkan/runtime/graph/containers/Value.h
@@ -58,7 +58,7 @@ struct Value final {
       bool as_bool;
     } u;
 
-    api::vTensor as_tensor;
+    std::unique_ptr<api::vTensor> as_tensor;
     api::StagingBuffer as_staging;
     TensorRef as_tensorref;
 
@@ -106,15 +106,18 @@ struct Value final {
     rhs.payload.member_name.~dtor_name();                                \
     break;
 
+#define CASE_MOVE_UNIQUE_PTR_TYPE(type_tag, member_name)      \
+  case type_tag:                                              \
+    payload.member_name = std::move(rhs.payload.member_name); \
+    break;
+
   Value(Value&& rhs) noexcept : tag(rhs.tag) {
     switch (tag) {
       // Scalar types
       CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::INT, as_int);
       CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::DOUBLE, as_double);
       CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::BOOL, as_bool);
-      // Tensor and tensor adjacent types
-      CASE_MOVE_MOVEABLE_TYPE(
-          TypeTag::TENSOR, api::vTensor, as_tensor, vTensor);
+      // Tensor adjacent types
       CASE_MOVE_MOVEABLE_TYPE(
           TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer);
       CASE_MOVE_MOVEABLE_TYPE(
@@ -132,6 +135,8 @@ struct Value final {
       CASE_MOVE_MOVEABLE_TYPE(
           TypeTag::STRING, std::string, as_string, basic_string);
       CASE_MOVE_MOVEABLE_TYPE(TypeTag::SYMINT, SymInt, as_symint, SymInt);
+      // Tensor type
+      CASE_MOVE_UNIQUE_PTR_TYPE(TypeTag::TENSOR, as_tensor);
 
       case TypeTag::NONE:
         clearToNone();
@@ -142,6 +147,7 @@ struct Value final {
 
 #undef CASE_MOVE_TRIVIALLY_COPYABLE_TYPE
 #undef CASE_MOVE_MOVEABLE_TYPE
+#undef CASE_MOVE_UNIQUE_PTR_TYPE
 
   //
   // Accessors
@@ -157,9 +163,6 @@ struct Value final {
 
   ~Value() {
     switch (tag) {
-      case TypeTag::TENSOR:
-        payload.as_tensor.~vTensor();
-        break;
       case TypeTag::STAGING:
         payload.as_staging.~StagingBuffer();
         break;
@@ -184,6 +187,9 @@ struct Value final {
       case TypeTag::SYMINT:
         payload.as_symint.~SymInt();
         break;
+      case TypeTag::TENSOR:
+        payload.as_tensor.reset();
+        break;
       // Manually list out the types so that if a type here is added later and
       // not handled the compiler can catch it.
       case TypeTag::NONE:
@@ -252,12 +258,6 @@ struct Value final {
     return payload.member_name;                             \
   }
 
-  SUPPORT_TRIVIALLY_MOVEABLE_TYPE(
-      api::vTensor,
-      Tensor,
-      TypeTag::TENSOR,
-      as_tensor);
-
   SUPPORT_TRIVIALLY_MOVEABLE_TYPE(
       api::StagingBuffer,
       Staging,
@@ -302,9 +302,36 @@ struct Value final {
 
   SUPPORT_TRIVIALLY_MOVEABLE_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint);
 
-#undef SUPPORT_TRIVIALLY_COPYABLE_TYPE
 #undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE
 
+#define SUPPORT_UNIQUE_PTR_TYPE(type, type_name, type_tag, member_name) \
+  explicit Value(type t) : tag(type_tag) {                              \
+    payload.member_name = std::make_unique<type>(std::move(t));         \
+  }                                                                     \
+  inline bool is##type_name() const {                                   \
+    return tag == type_tag;                                             \
+  }                                                                     \
+  inline type& to##type_name() const {                                  \
+    VK_CHECK_COND(                                                      \
+        is##type_name(),                                                \
+        "Expected value to have type " #type_name ", got ",             \
+        tag,                                                            \
+        " instead.");                                                   \
+    return *payload.member_name;                                        \
+  }                                                                     \
+  inline const type& toConst##type_name() const {                       \
+    VK_CHECK_COND(                                                      \
+        is##type_name(),                                                \
+        "Expected value to have type " #type_name ", got ",             \
+        tag,                                                            \
+        " instead.");                                                   \
+    return *payload.member_name;                                        \
+  }
+
+  SUPPORT_UNIQUE_PTR_TYPE(api::vTensor, Tensor, TypeTag::TENSOR, as_tensor);
+
+#undef SUPPORT_UNIQUE_PTR_TYPE
+
  private:
   Payload payload;
   TypeTag tag;
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index ee49f95ee21..6e491bed22e 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1087,8 +1087,8 @@ TEST_F(VulkanComputeAPITest, print_object_sizes) {
 
   // Current known size on 64 bit system: 1040 B
   EXPECT_TRUE(sizeof(vTensor) < 1200);
-  // Current known size on 64 bit system: 1056 B
-  EXPECT_TRUE(sizeof(Value) < 1200);
+  // Current known size on 64 bit system: 120 B
+  EXPECT_TRUE(sizeof(Value) < 128);
   // Current known size on 64 bit system: 120 B
   EXPECT_TRUE(sizeof(StagingBuffer) < 500);
   // Current known size on 64 bit system: 384 B