From f658a42eab7c0536f7f8e65600d72ea56e6c24ac Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 2 Jan 2025 13:05:28 -0800 Subject: [PATCH 1/3] [ET-VK][ez] Fix undefined behaviour in ambiguous `ParamsBuffer` constructor ## Context I discovered this bug when trying to execute the `vulkan_compute_api_test` binary on Windows. Almost all the tests were failing, with compute shaders producing incorrect results. After bisecting the change, it turns out the culprit is https://github.com/pytorch/executorch/pull/7015. The diff introduced an alternative templated constructor for `ParamsBuffer` which would initialize an empty UBO with a specified size instead of wrapping a pre-existing object. The issue is that these constructors are ambiguous because they are both template constructors and both accept only one argument besides the `Context*` pointer. Therefore, the original constructor would be called when certain callsites intended to call the new constructor. This resulted in a UBO being created with an incorrect size, which in turn caused the tensor's metadata to be passed incorrectly into a compute shader. To fix, I added a dummy argument into the new constructor for disambiguation purposes. I also changed it so that it's not templated, since there's no reason for it to be templated. 
Differential Revision: [D67770791](https://our.internmc.facebook.com/intern/diff/D67770791/) ghstack-source-id: 260031108 Pull Request resolved: https://github.com/pytorch/executorch/pull/7478 --- backends/vulkan/runtime/api/containers/ParamsBuffer.h | 5 +++-- backends/vulkan/runtime/api/containers/Tensor.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.h b/backends/vulkan/runtime/api/containers/ParamsBuffer.h index fe157c5e014..ecc07892cf7 100644 --- a/backends/vulkan/runtime/api/containers/ParamsBuffer.h +++ b/backends/vulkan/runtime/api/containers/ParamsBuffer.h @@ -31,8 +31,9 @@ class ParamsBuffer final { vulkan_buffer_( context_p_->adapter_ptr()->vma().create_params_buffer(block)) {} - template - ParamsBuffer(Context* context_p, const VkDeviceSize nbytes) + // The last bool argument, though unused, is required to disambiguate this + // constructor from the one above. + ParamsBuffer(Context* context_p, const VkDeviceSize nbytes, const bool unused) : context_p_(context_p), vulkan_buffer_( context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {} diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 21b0ee4b176..92e310d36de 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -659,7 +659,7 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { const vkapi::BufferBindInfo vTensor::sizes_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize); + uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); } if (sizes_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( @@ -674,7 +674,7 @@ const vkapi::BufferBindInfo vTensor::sizes_ubo() { const vkapi::BufferBindInfo vTensor::strides_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, 
kMaxUniformBufferSize); + uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); } if (unsqueezed_strides_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( @@ -691,7 +691,7 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize); + uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); } if (logical_limits_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( @@ -707,7 +707,7 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { const vkapi::BufferBindInfo vTensor::numel_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize); + uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); } if (numel_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( From bd59382936997c9fd0ab594deaed9d95513c6789 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 2 Jan 2025 13:05:30 -0800 Subject: [PATCH 2/3] [ET-VK] Create Pipeline layouts with push constant ranges when required ## Context https://github.com/pytorch/executorch/pull/7223 added the ability to use push constants in shaders. However, one thing the diff missed was not specifying that the compute pipeline layout needed to include a push constant upon creation. The Vulkan validation layers warns against this, and on certain GPUs such as the integrated Intel GPU on my windows laptop compute shaders will produce incorrect output. This diff makes the change such that the compute pipeline layout will be created with a push constant block if necessary. ## Solution Change the key of the pipeline layout cache to accept an additional push constant size field. The push constant size will be used to create the pipeline layout with a push constant block of the specified size. 
Differential Revision: [D67770793](https://our.internmc.facebook.com/intern/diff/D67770793/) ghstack-source-id: 260031109 Pull Request resolved: https://github.com/pytorch/executorch/pull/7479 --- backends/vulkan/runtime/api/Context.cpp | 9 +++--- backends/vulkan/runtime/api/Context.h | 9 ++++-- .../vulkan/runtime/graph/ops/DispatchNode.cpp | 23 +++++++------- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 4 +-- backends/vulkan/runtime/vk_api/Pipeline.cpp | 31 +++++++++++++++---- backends/vulkan/runtime/vk_api/Pipeline.h | 16 ++++++---- 6 files changed, 60 insertions(+), 32 deletions(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 5426ea4e60b..9517941f364 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -90,12 +90,13 @@ void Context::report_shader_dispatch_end() { vkapi::DescriptorSet Context::get_descriptor_set( const vkapi::ShaderInfo& shader_descriptor, const utils::uvec3& local_workgroup_size, - const vkapi::SpecVarList& additional_constants) { + const vkapi::SpecVarList& additional_constants, + const uint32_t push_constants_size) { VkDescriptorSetLayout shader_layout = shader_layout_cache().retrieve(shader_descriptor.kernel_layout); VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout); + pipeline_layout_cache().retrieve(shader_layout, push_constants_size); vkapi::SpecVarList spec_constants = { SV(local_workgroup_size[0u]), @@ -105,7 +106,7 @@ vkapi::DescriptorSet Context::get_descriptor_set( spec_constants.append(additional_constants); VkPipeline pipeline = pipeline_cache().retrieve( - {pipeline_layout_cache().retrieve(shader_layout), + {pipeline_layout_cache().retrieve(shader_layout, push_constants_size), shader_cache().retrieve(shader_descriptor), spec_constants}); @@ -151,7 +152,7 @@ void Context::register_shader_dispatch( const VkDescriptorSetLayout shader_layout = 
shader_layout_cache().retrieve(shader_descriptor.kernel_layout); const VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout); + pipeline_layout_cache().retrieve(shader_layout, push_constants_size); cmd_.set_push_constants( pipeline_layout, push_constants_data, push_constants_size); } diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h index 65f3adb511d..300fd3995dd 100644 --- a/backends/vulkan/runtime/api/Context.h +++ b/backends/vulkan/runtime/api/Context.h @@ -188,12 +188,13 @@ class Context final { vkapi::DescriptorSet get_descriptor_set( const vkapi::ShaderInfo&, const utils::uvec3&, - const vkapi::SpecVarList&); + const vkapi::SpecVarList&, + const uint32_t push_constants_size); inline vkapi::DescriptorSet get_descriptor_set( const vkapi::ShaderInfo& shader_descriptor, const utils::uvec3& local_work_group_size) { - return get_descriptor_set(shader_descriptor, local_work_group_size, {}); + return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u); } void register_shader_dispatch( @@ -333,8 +334,10 @@ inline bool Context::submit_compute_job( dispatch_id); // Factor out template parameter independent code to minimize code bloat. + // Note that push constants are not exposed yet via this API, therefore the + // push constants size is assumed to be 0. 
vkapi::DescriptorSet descriptor_set = get_descriptor_set( - shader, local_work_group_size, specialization_constants); + shader, local_work_group_size, specialization_constants, 0u); detail::bind( descriptor_set, diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp index 87b4b5b5480..a163a0d7aea 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp @@ -60,14 +60,24 @@ void DispatchNode::encode(ComputeGraph* graph) { std::unique_lock cmd_lock = context->dispatch_lock(); + std::array push_constants_data; + uint32_t push_constants_offset = 0; + + for (const auto& push_constant : push_constants_) { + push_constants_offset += push_constant.write( + push_constants_data.data(), + push_constants_offset, + kMaxPushConstantSize); + } + context->report_shader_dispatch_start( shader_.kernel_name, global_workgroup_size_, local_workgroup_size_, node_id_); - vkapi::DescriptorSet descriptor_set = - context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_); + vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( + shader_, local_workgroup_size_, spec_vars_, push_constants_offset); uint32_t idx = 0; idx = bind_values_to_descriptor_set( @@ -75,15 +85,6 @@ void DispatchNode::encode(ComputeGraph* graph) { bind_params_to_descriptor_set(params_, descriptor_set, idx); - std::array push_constants_data; - uint32_t push_constants_offset = 0; - - for (const auto& push_constant : push_constants_) { - push_constants_offset += push_constant.write( - push_constants_data.data(), - push_constants_offset, - kMaxPushConstantSize); - } context->register_shader_dispatch( descriptor_set, pipeline_barrier, diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index 89719fb0dd3..e27723468ab 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ 
b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -75,8 +75,8 @@ void PrepackNode::encode(ComputeGraph* graph) { { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::DescriptorSet descriptor_set = - context->get_descriptor_set(shader_, local_workgroup_size_, spec_vars_); + vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( + shader_, local_workgroup_size_, spec_vars_, 0u); uint32_t idx = 0; bind_tensor_to_descriptor_set( diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp index 49bbf083359..3856d406c24 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp @@ -205,17 +205,29 @@ bool operator==(const SpecVarList& lhs, const SpecVarList& rhs) { PipelineLayout::PipelineLayout( VkDevice device, - VkDescriptorSetLayout descriptor_layout) + VkDescriptorSetLayout descriptor_layout, + const uint32_t push_constants_size) : device_(device), handle_{VK_NULL_HANDLE} { - // TODO: Enable push constants + VkPushConstantRange pc_range{ + VK_SHADER_STAGE_COMPUTE_BIT, // stageFlags + 0u, // offset + push_constants_size, // size + }; + uint32_t num_push_constants = 0u; + VkPushConstantRange* pc_ranges_ptr = nullptr; + if (push_constants_size > 0u) { + num_push_constants = 1u; + pc_ranges_ptr = &pc_range; + } + const VkPipelineLayoutCreateInfo pipeline_layout_create_info{ VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType nullptr, // pNext 0u, // flags 1u, // setLayoutCount &descriptor_layout, // pSetLayouts - 0u, // pushConstantRangeCount - nullptr, // pPushConstantRanges + num_push_constants, // pushConstantRangeCount + pc_ranges_ptr, // pPushConstantRanges }; VK_CHECK(vkCreatePipelineLayout( @@ -344,12 +356,19 @@ PipelineLayoutCache::~PipelineLayoutCache() { } VkPipelineLayout PipelineLayoutCache::retrieve( - const PipelineLayoutCache::Key& key) { + const VkDescriptorSetLayout layout, + const uint32_t push_constants_size) { + PipelineLayoutCache::Key 
key{layout, push_constants_size}; std::lock_guard lock(cache_mutex_); auto it = cache_.find(key); if (cache_.cend() == it) { - it = cache_.insert({key, PipelineLayoutCache::Value(device_, key)}).first; + it = cache_ + .insert( + {key, + PipelineLayoutCache::Value( + device_, layout, push_constants_size)}) + .first; } return it->second.handle(); diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h index 4f42a9bf6bb..5460a0acba7 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.h +++ b/backends/vulkan/runtime/vk_api/Pipeline.h @@ -121,7 +121,7 @@ VkImageLayout vk_layout(const PipelineStageFlags, const MemoryAccessFlags); class PipelineLayout final { public: - explicit PipelineLayout(VkDevice, VkDescriptorSetLayout); + explicit PipelineLayout(VkDevice, VkDescriptorSetLayout, const uint32_t); PipelineLayout(const PipelineLayout&) = delete; PipelineLayout& operator=(const PipelineLayout&) = delete; @@ -193,13 +193,17 @@ class PipelineLayoutCache final { PipelineLayoutCache& operator=(PipelineLayoutCache&&) = delete; ~PipelineLayoutCache(); - - using Key = VkDescriptorSetLayout; + using Key = std::pair; using Value = PipelineLayout; struct Hasher { - inline size_t operator()(VkDescriptorSetLayout descriptor_layout) const { - return std::hash()(descriptor_layout); + inline size_t operator()( + std::pair key) const { + size_t seed = 0; + seed = utils::hash_combine( + seed, std::hash()(key.first)); + seed = utils::hash_combine(seed, std::hash()(key.second)); + return seed; } }; @@ -212,7 +216,7 @@ class PipelineLayoutCache final { std::unordered_map cache_; public: - VkPipelineLayout retrieve(const Key&); + VkPipelineLayout retrieve(const VkDescriptorSetLayout, const uint32_t); void purge(); }; From f1c5e513ecf75b832285660779661d0935a3982e Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 2 Jan 2025 13:05:33 -0800 Subject: [PATCH 3/3] [ET-VK] Fix metadata UBO VVL warnings ## Context Recently 
https://github.com/pytorch/executorch/pull/7015 was implemented so that all tensor metadata (e.g. sizes, strides) would be stored in a single UBO instead of with separate UBO objects. This helps with memory savings, presumably due to reduced fragmentation of memory allocations. However, once the change was introduced, I noticed two new warnings produced by the Vulkan Validation Layer. The first complains that the offset of a UBO descriptor is not a multiple of the `minUniformBufferOffsetAlignment` field reported by the physical device properties. The second complains that the range of a UBO descriptor exceeds the offset + range of the underlying UBO object. ## Solution To address the first one, instead of using `sizeof(utils::ivec4)` to determine the offset per metadata field, check the `minUniformBufferOffsetAlignment` field reported by the device and use that instead. The second warning arises because the logic in the constructor of `BufferBindInfo` had a mistake; instead of using the range of the underlying UBO object, it should use that range minus the user-specified offset. 
Differential Revision: [D67770792](https://our.internmc.facebook.com/intern/diff/D67770792/) ghstack-source-id: 260031110 Pull Request resolved: https://github.com/pytorch/executorch/pull/7480 --- .../vulkan/runtime/api/containers/Tensor.cpp | 43 ++++++++++++------- .../vulkan/runtime/api/containers/Tensor.h | 17 +++----- backends/vulkan/runtime/vk_api/Adapter.h | 4 ++ backends/vulkan/runtime/vk_api/Descriptor.cpp | 12 +++++- backends/vulkan/runtime/vk_api/Descriptor.h | 4 ++ backends/vulkan/runtime/vk_api/Device.cpp | 11 ++++- backends/vulkan/runtime/vk_api/Device.h | 1 + 7 files changed, 63 insertions(+), 29 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 92e310d36de..900854ccd75 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -658,66 +658,77 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { } const vkapi::BufferBindInfo vTensor::sizes_ubo() { + const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); + uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); } if (sizes_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize, + (uniforms_size_ + size_per_ubo) <= max_ubo_size, "Uniform data allocation has exceeded Tensor uniform buffer size"); sizes_uniform_offset_ = uniforms_size_; - uniforms_size_ += kSizePerUniform; + uniforms_size_ += size_per_ubo; uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_); } - return vkapi::BufferBindInfo(uniforms_.buffer(), sizes_uniform_offset_); + return vkapi::BufferBindInfo( + uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo); } const vkapi::BufferBindInfo 
vTensor::strides_ubo() { + const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); + uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); } if (unsqueezed_strides_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize, + (uniforms_size_ + size_per_ubo) <= max_ubo_size, "Uniform data allocation has exceeded Tensor uniform buffer size"); unsqueezed_strides_offset_ = uniforms_size_; - uniforms_size_ += kSizePerUniform; + uniforms_size_ += size_per_ubo; uniforms_.update( utils::make_whcn_ivec4(unsqueezed_strides_), unsqueezed_strides_offset_); } - return vkapi::BufferBindInfo(uniforms_.buffer(), unsqueezed_strides_offset_); + return vkapi::BufferBindInfo( + uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo); } const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { + const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); + uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); } if (logical_limits_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize, + (uniforms_size_ + size_per_ubo) <= max_ubo_size, "Uniform data allocation has exceeded Tensor uniform buffer size"); logical_limits_uniform_offset_ = uniforms_size_; - uniforms_size_ += kSizePerUniform; + uniforms_size_ += size_per_ubo; uniforms_.update(logical_limits(), logical_limits_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), logical_limits_uniform_offset_); + uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo); } const vkapi::BufferBindInfo 
vTensor::numel_ubo() { + const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_.context_, kMaxUniformBufferSize, true); + uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); } if (numel_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + kSizePerUniform) <= kMaxUniformBufferSize, + (uniforms_size_ + size_per_ubo) <= max_ubo_size, "Uniform data allocation has exceeded Tensor uniform buffer size"); numel_uniform_offset_ = uniforms_size_; - uniforms_size_ += kSizePerUniform; + uniforms_size_ += size_per_ubo; uniforms_.update(numel(), numel_uniform_offset_); } - return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_); + return vkapi::BufferBindInfo( + uniforms_.buffer(), numel_uniform_offset_, size_per_ubo); } size_t vTensor::staging_buffer_numel() const { diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 3e51be6f948..49d5fcd36a3 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -348,16 +348,13 @@ class vTensor final { uint32_t numel_uniform_offset_; uint32_t logical_limits_uniform_offset_; - // Size allocated for each uniform - // each uniform is assumed to be a vec of 4 ints to maintain 16 byte alignemnt - constexpr static size_t kSizePerUniform = sizeof(utils::ivec4); - // Total size of tensor's uniform buffer - constexpr static size_t kMaxUniformBufferSize = - 4 * // we have 4 uniforms that are passed on to shaders - kSizePerUniform; - - // Initial value of uniform buffer offsets - constexpr static uint32_t kUniformOffsetUnset = kMaxUniformBufferSize; + // Maximum number of metadata fields that can be stored in the metadata UBO. + // This is used to calculate the size of the UBO that should be allocated. 
+ constexpr static size_t kMaxMetadataFieldCount = 4; + + // Initial value of uniform buffer offsets. 1 is selected as it is essentially + // impossible for a ubo to have an offset of 1. + constexpr static uint32_t kUniformOffsetUnset = 1; vTensorStorage storage_; diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index 0deea81a7f3..be0554161d3 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -207,6 +207,10 @@ class Adapter final { return supports_8bit_storage_buffers() && supports_int8_shader_types(); } + inline size_t min_ubo_alignment() const { + return physical_device_.min_ubo_alignment; + } + // Command Buffer Submission void diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 956711bccc0..b42ade6ea02 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -28,7 +28,17 @@ BufferBindInfo::BufferBindInfo( const uint32_t offset_p) : handle(buffer_p.handle()), offset(buffer_p.mem_offset() + offset_p), - range(buffer_p.mem_range()) {} + range(buffer_p.mem_range() - offset_p) {} + +BufferBindInfo::BufferBindInfo( + const VulkanBuffer& buffer_p, + const uint32_t offset_p, + const uint32_t range_p) + : handle(buffer_p.handle()), + offset(buffer_p.mem_offset() + offset_p), + range(range_p) { + VK_CHECK_COND(range_p <= (buffer_p.mem_range() - offset_p)); +} // // ParamsBindList diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h index 38401f2243d..60d66a22619 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ b/backends/vulkan/runtime/vk_api/Descriptor.h @@ -34,6 +34,10 @@ struct BufferBindInfo final { BufferBindInfo(); BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u); + BufferBindInfo( + const VulkanBuffer& buffer_p, + const uint32_t offset_p, + const uint32_t 
range_p); }; struct ParamsBindList final { diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp index 21769c6a70e..c4119e04b78 100644 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ b/backends/vulkan/runtime/vk_api/Device.cpp @@ -39,10 +39,17 @@ PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle) num_compute_queues(0), supports_int16_shader_types(false), has_unified_memory(false), - has_timestamps(properties.limits.timestampComputeAndGraphics), - timestamp_period(properties.limits.timestampPeriod) { + has_timestamps(false), + timestamp_period(0), + min_ubo_alignment(0) { // Extract physical device properties vkGetPhysicalDeviceProperties(handle, &properties); + + // Extract fields of interest + has_timestamps = properties.limits.timestampComputeAndGraphics; + timestamp_period = properties.limits.timestampPeriod; + min_ubo_alignment = properties.limits.minUniformBufferOffsetAlignment; + vkGetPhysicalDeviceMemoryProperties(handle, &memory_properties); VkPhysicalDeviceFeatures2 features2{ diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h index d883cfb7041..70d5b1db5af 100644 --- a/backends/vulkan/runtime/vk_api/Device.h +++ b/backends/vulkan/runtime/vk_api/Device.h @@ -49,6 +49,7 @@ struct PhysicalDevice final { bool has_unified_memory; bool has_timestamps; float timestamp_period; + size_t min_ubo_alignment; explicit PhysicalDevice(VkPhysicalDevice); };