From e530eec6dedb05d2c7fca797f27407bb7ae535db Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Mon, 22 Apr 2024 13:39:25 -0700
Subject: [PATCH] [ET-VK] Bring back `extents_ubo()` as `texture_limits_ubo()`

## Context

https://github.com/pytorch/executorch/pull/3181 deprecated the `gpu_sizes_ubo()` and `extents_ubo()` functions of `vTensor` in order to standardize how shaders consume shape/size metadata of input tensors. However, this came at the cost of increasing the overhead of bounds checking (which is needed to support dynamic shapes), since shaders now need to convert the input sizes to texture limits before checking whether a given texel position is out of bounds.

Benchmarking revealed that this overhead can be quite significant, especially on lower-power mobile GPUs. In the interest of preserving performance, `extents_ubo()` is re-introduced, since bounds checking is an operation that is common to every single shader. However, some improvements are made:

* instead of `extents`, the nomenclature `texture_limits` is used in order to differentiate from physical image extents of the texture.
* `texture_limits` is represented via an `ivec3` (previously `uvec4`); this means that to use it for bounds checking, there does not need to be an implicit cast from `uvec` to `ivec`, and there is also no need for swizzling.

Also introduced in this changeset is the convention of passing both the texture limits and tensor sizes instead of using `pos_out_of_bounds()`. Passing in the texture limits is probably cheaper than using `pos_out_of_bounds()`. There are some exceptions, though, where I chose not to migrate to this pattern in order to avoid passing in too many variants of tensor metadata.

### What about `gpu_sizes_ubo`?

I will hold off on re-introducing `gpu_sizes_ubo` for now since converting `sizes` to `gpu_sizes` is much cheaper compared to `pos_out_of_bounds()`:

```
ivec4 sizes[packed_dim] = alignup4(sizes[packed_dim])
```

Will perform some additional benchmarking on this to see if the overhead of the alignment warrants an explicit API for passing in GPU sizes to shaders.

Differential Revision: [D56435574](https://our.internmc.facebook.com/intern/diff/D56435574/)

[ghstack-poisoned]
---
 backends/vulkan/runtime/api/Tensor.cpp        | 51 +++++++++++++++++--
 backends/vulkan/runtime/api/Tensor.h          | 32 +++++++++---
 .../vulkan/runtime/graph/ops/glsl/conv2d.glsl |  8 ++-
 .../runtime/graph/ops/glsl/conv2d_dw.glsl     |  8 ++-
 .../graph/ops/glsl/conv2d_dw_output_tile.glsl |  8 ++-
 .../runtime/graph/ops/glsl/conv2d_pw.glsl     | 10 ++--
 .../graph/ops/glsl/conv_transpose2d.glsl      |  8 +--
 .../vulkan/runtime/graph/ops/glsl/matmul.glsl |  8 ++-
 .../runtime/graph/ops/glsl/max_pool2d.glsl    |  8 ++-
 .../graph/ops/glsl/native_layer_norm.glsl     | 12 +++--
 .../runtime/graph/ops/glsl/permute.glsl       | 18 +++----
 .../graph/ops/glsl/select_batch_4d.glsl       | 12 +++--
 .../graph/ops/glsl/select_channel_3d.glsl     | 12 +++--
 .../graph/ops/glsl/select_channel_4d.glsl     | 12 +++--
 .../graph/ops/glsl/select_height_3d.glsl      | 12 +++--
 .../graph/ops/glsl/select_height_4d.glsl      | 12 +++--
 .../graph/ops/glsl/select_width_3d.glsl       | 12 +++--
 .../graph/ops/glsl/select_width_4d.glsl       | 12 +++--
 .../runtime/graph/ops/glsl/slice_channel.glsl |  1 -
 .../runtime/graph/ops/glsl/sum_dim.glsl       |  6 +--
 .../graph/ops/glsl/sum_dim_keepdim.glsl       |  6 +--
 .../runtime/graph/ops/glsl/unary_op.glsl      |  8 ++-
 .../runtime/graph/ops/impl/Convolution.cpp    |  4 +-
 .../vulkan/runtime/graph/ops/impl/MatMul.cpp  |  4 +-
 .../graph/ops/impl/NativeLayerNorm.cpp        |  6 ++-
 .../vulkan/runtime/graph/ops/impl/Permute.cpp |  6 ++-
 .../vulkan/runtime/graph/ops/impl/Pool.cpp    |  4 +-
 .../vulkan/runtime/graph/ops/impl/Select.cpp  |  5 +-
 .../vulkan/runtime/graph/ops/impl/Sum.cpp     |  4 +-
 .../vulkan/runtime/graph/ops/impl/UnaryOp.cpp |  4 +-
 .../vulkan/test/vulkan_compute_api_test.cpp   | 24 ++++-----
 31 files changed, 202 insertions(+), 135 deletions(-)

diff --git a/backends/vulkan/runtime/api/Tensor.cpp b/backends/vulkan/runtime/api/Tensor.cpp
index 6cbba048528..402d35d75bb 100644
--- a/backends/vulkan/runtime/api/Tensor.cpp
+++ b/backends/vulkan/runtime/api/Tensor.cpp
@@ -139,8 +139,10 @@ vTensor::vTensor(
       // Calculate sizes and strides
       sizes_(sizes.begin(), sizes.end()),
       gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
-      // Utility Uniform Buffer that can be passed to shaders as arguments
-      sizes_uniform_(context, api::utils::make_whcn_ivec4(sizes_)),
+      texture_limits_{{0, 0, 0}},
+      // Utility Uniform Buffers that can be passed to shaders as arguments
+      sizes_uniform_(),
+      texture_limits_uniform_(),
       // Construct Tensor storage
       storage_(
           context,
@@ -149,6 +151,13 @@ vTensor::vTensor(
           gpu_sizes_,
           dtype_,
           allocate_memory) {
+  if (storage_type != api::kBuffer) {
+    texture_limits_.limits = api::utils::ivec3{
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[0]),
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[1]),
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[2])};
+  }
+
   if (dtype == api::kHalf) {
     VK_CHECK_COND(
         api::context()->adapter_ptr()->has_16bit_storage(),
@@ -187,6 +196,22 @@ api::VulkanBuffer& vTensor::buffer(
   return storage_.buffer_;
 }
 
+const api::BufferBindInfo vTensor::sizes_ubo() {
+  if (!sizes_uniform_.buffer()) {
+    sizes_uniform_ = api::UniformParamsBuffer(
+        storage_.context_, api::utils::make_whcn_ivec4(sizes_));
+  }
+  return api::BufferBindInfo(sizes_uniform_.buffer());
+}
+
+const api::BufferBindInfo vTensor::texture_limits_ubo() {
+  if (!texture_limits_uniform_.buffer()) {
+    texture_limits_uniform_ =
+        api::UniformParamsBuffer(storage_.context_, texture_limits_);
+  }
+  return api::BufferBindInfo(texture_limits_uniform_.buffer());
+}
+
 VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
   switch (storage_type()) {
     case api::kBuffer:
@@ -224,7 +249,25 @@ void vTensor::bind_allocation(const api::MemoryAllocation& allocation) {
 void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
   sizes_ = new_sizes;
   gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
-  sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
+
+  if (storage_type() != api::kBuffer) {
+    // Calculate the extents of the image texture that would have been required
+    // for a tensor of the new sizes.
+    api::utils::uvec3 virtual_extents =
+        create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
+    // Update the texture limits to reflect the new virtual extents.
+    texture_limits_.limits = api::utils::ivec3{
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
+  }
+
+  if (sizes_uniform_.buffer()) {
+    sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
+  }
+  if (texture_limits_uniform_.buffer()) {
+    texture_limits_uniform_.update(texture_limits_);
+  }
 }
 
 void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
@@ -236,6 +279,8 @@ void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
 }
 
 void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
+  // For texture storage check that the current texture is large enough for the
+  // new sizes of the tensor.
   if (storage_type() != api::kBuffer) {
     api::utils::uvec3 virtual_extents =
         create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
index 8ba99ed1827..53dbfecffe6 100644
--- a/backends/vulkan/runtime/api/Tensor.h
+++ b/backends/vulkan/runtime/api/Tensor.h
@@ -94,6 +94,13 @@ class vTensorStorage final {
 };
 
 class vTensor final {
+  struct TextureLimits {
+    // Alignment is required to conform with Vulkan specification; a 3 or 4
+    // component vector with components of size N must have base alignment of
+    // 4N.
+    alignas(16) api::utils::ivec3 limits;
+  };
+
  public:
   explicit vTensor(
       api::Context* context,
@@ -115,11 +122,18 @@ class vTensor final {
 
   std::vector<int64_t> sizes_;
   std::vector<int64_t> gpu_sizes_;
+  TextureLimits texture_limits_;
 
-  // A Vulkan uniform buffer containing the tensor sizes in WHCN that can be
-  // passed into a shader.
+  // A Vulkan uniform buffer containing the (W, H, C, N) tensor sizes that can
+  // be passed into a shader.
   api::UniformParamsBuffer sizes_uniform_;
 
+  // A Vulkan uniform buffer containing the texture limits derived from the
+  // tensor's current size information that can be passed into a shader. Note
+  // that the texture limits may be different from the texture's extents if the
+  // tensor has been resized with `virtual_resize()`.
+  api::UniformParamsBuffer texture_limits_uniform_;
+
   vTensorStorage storage_;
 
  public:
@@ -194,11 +208,17 @@ class vTensor final {
 
   /*
    * Get the binding information for the uniform buffer object containing the
-   * tensor sizes to use in a compute shader.
+   * tensor sizes to use in a compute shader. Note that the GPU buffer will be
+   * allocated the first time this function is called.
    */
-  inline const api::BufferBindInfo sizes_ubo() {
-    return api::BufferBindInfo(sizes_uniform_.buffer());
-  }
+  const api::BufferBindInfo sizes_ubo();
+
+  /*
+   * Get the binding information for the uniform buffer object containing the
+   * texture limits to use in a compute shader. Note that the GPU buffer will be
+   * allocated the first time this function is called.
+   */
+  const api::BufferBindInfo texture_limits_ubo();
 
   inline size_t numel() const {
     return api::utils::multiply_integers(sizes());
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
index 578c195ea9d..33f5ff9dd3e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,8 +44,6 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a 2D convolution. Each shader invocation calculates the output at
  * a single output location.
@@ -53,7 +51,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index fa6dee4760f..56d70a2bfe0 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,8 +44,6 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
@@ -53,7 +51,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index 207eab0a9c6..cf4cfe66ac2 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,8 +44,6 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
@@ -53,7 +51,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
index bb780ad2886..453a03dea54 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,8 +44,6 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
  * output tile for pointwise convolution is more efficient because the kernel
@@ -71,7 +69,7 @@ void main() {
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (pos_out_of_bounds(pos[0], out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos[0], out_limits))) {
     return;
   }
 
@@ -146,7 +144,7 @@ void main() {
   }
 
   for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) {
-    if (!pos_out_of_bounds(pos[i], out_sizes, packed_dim)) {
+    if (all(lessThan(pos[i], out_limits))) {
       imageStore(image_out, pos[i], sum[i]);
     }
   }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
index 4a141ddded9..3f2f6241a1d 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
@@ -21,11 +21,11 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
-layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
+layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
   ivec4 in_sizes;
 };
 
@@ -54,7 +54,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl
index 9cd0c63ac88..a911c4fb6e4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl
@@ -16,8 +16,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
 layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
 layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
 
-layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -26,12 +26,10 @@ layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int out_packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, out_packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl
index ccac87b3864..25749afbf85 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl
@@ -19,8 +19,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
 layout(set = 0, binding = 1, ${IMAGE_FORMAT["int"]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM]["int"]} image_idx;
 layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -36,12 +36,10 @@ layout(set = 0, binding = 5) uniform PRECISION restrict Params {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
index 32bf2df0e93..235408c0a81 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
@@ -25,22 +25,24 @@ layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in;
 layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in;
 
-layout(set = 0, binding = 6) uniform PRECISION restrict Sizes {
+layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 7) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-layout(set = 0, binding = 7) uniform PRECISION restrict Epsilon {
+layout(set = 0, binding = 8) uniform PRECISION restrict Epsilon {
   float epsilon;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
index 4ba2c7f4c60..ff5ab63a4f7 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
@@ -19,31 +19,27 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-/*
- * Params Buffer
- */
-layout(set = 0, binding = 3) uniform PRECISION restrict Block {
+layout(set = 0, binding = 4) uniform PRECISION restrict Block {
   // output dims
   uvec4 out_ndims;
   // x = output channels aligned to 4, y = input channels aligned to 4
   uvec2 ch_info;
 };
 
-/*
- * Local Work Group
- */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
index f6135d138c2..f94e1120492 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
@@ -17,11 +17,15 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-layout(set = 0, binding = 3) uniform PRECISION restrict SelectVal {
+layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
   // data.x: index along batch dim to select
   // data.y: number of batches
   // data.z: number of texels per batch
@@ -31,8 +35,6 @@ layout(set = 0, binding = 3) uniform PRECISION restrict SelectVal {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const int num_batches = select_info.y;
   const int num_texel_per_batch = select_info.z;
@@ -40,7 +42,7 @@ void main() {
 
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
index b86b15e8614..0bbec798484 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
@@ -20,23 +20,25 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
 // index to select
-layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
   int index;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
index b3ff196682e..517362f76ea 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
@@ -18,11 +18,15 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-layout(set = 0, binding = 3) uniform PRECISION restrict SelectVal {
+layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
   // data.x: index along channel dim to select
   // data.y: number of batches
   // data.z: number of texels per batch
@@ -32,12 +36,10 @@ layout(set = 0, binding = 3) uniform PRECISION restrict SelectVal {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
index b71efd7d50b..87409fb35fd 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
@@ -18,23 +18,25 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
 // index to select
-layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
   int index;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
index e78b692ecb3..2e4e2afb2db 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
@@ -18,12 +18,16 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
 // index to select
-layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
   // data.x: index along height dim to select
   // data.y: number of batches
   // data.z: number of texels per batch
@@ -33,12 +37,10 @@ layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
index 56d71f58d02..1e12d15ab21 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
@@ -19,23 +19,25 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
 // index to select
-layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
   int index;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
index 3e09e329b31..ffbd8afbda0 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
@@ -19,12 +19,16 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
 // index to select
-layout(set = 0, binding = 3) uniform PRECISION restrict SelectVal {
+layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
   // data.x: index along width dim to select
   // data.y: number of batches
   // data.z: number of texels per batch
@@ -34,12 +38,10 @@ layout(set = 0, binding = 3) uniform PRECISION restrict SelectVal {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
index cfe264b5491..607f77d8254 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
@@ -39,7 +39,6 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 
 void main() {
   const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
-
   const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim);
 
   if (any(greaterThanEqual(idx, out_sizes))) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl
index 3e7cb25be5a..03cd94fb3d7 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl
@@ -18,8 +18,8 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 // dim to sum
@@ -54,7 +54,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl
index b7ebd353b57..64d37a13e8f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/sum_dim_keepdim.glsl
@@ -17,8 +17,8 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 // dim to sum
@@ -48,7 +48,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl
index fda2a08188a..85e2c5c1a5e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl
@@ -21,8 +21,8 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 3) uniform PRECISION restrict Min {
@@ -35,12 +35,10 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Max {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 2ad1880667c..20d7c9256bb 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -372,13 +372,13 @@ void add_conv2d_node(
        {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}},
       // Shader params buffers
       {
-          t_out->sizes_ubo(),
+          t_out->texture_limits_ubo(),
           t_in->sizes_ubo(),
           graph.create_params_buffer(kernel_params),
           graph.create_params_buffer(extra_params),
       },
       // Specialization Constants
-      {t_out->gpu_memory_layout_int()},
+      {},
       // Resizing Logic
       resize_conv2d_node,
       {weight, stride, padding, dilation, transposed, output_padding}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
index 4ac6e148274..053ef0ff350 100644
--- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -93,9 +93,9 @@ void add_matmul_node(
       {{out, api::MemoryAccessType::WRITE},
        {{arg1, arg2}, api::MemoryAccessType::READ}},
       // Shader params buffers
-      {t_out->sizes_ubo(), t_mat1->sizes_ubo()},
+      {t_out->texture_limits_ubo(), t_mat1->sizes_ubo()},
       // Specialization Constants
-      {t_out->gpu_memory_layout_int()},
+      {},
       // Resizing Logic
       resize_matmul_node));
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
index 1f34b0344e8..0c579274448 100644
--- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
@@ -109,9 +109,11 @@ void add_native_layer_norm_node(
         api::MemoryAccessType::WRITE},
        {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}},
       // Shader params buffers
-      {t_out->sizes_ubo(), graph.create_params_buffer(epsilon)},
+      {t_out->texture_limits_ubo(),
+       t_out->sizes_ubo(),
+       graph.create_params_buffer(epsilon)},
       // Specialization Constants
-      {SV(t_out->gpu_memory_layout_int())},
+      {},
       // Resizing Logic
       resize_native_layer_norm_node,
       {normalized_shape}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
index 09e5cc906e9..3bc6ca52c60 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -88,9 +88,11 @@ void add_permute_node(
       global_size,
       local_size,
       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
-      {t_out->sizes_ubo(), graph.create_params_buffer(params)},
+      {t_out->texture_limits_ubo(),
+       t_out->sizes_ubo(),
+       graph.create_params_buffer(params)},
       // Specialization Constants
-      {SV(t_out->gpu_memory_layout_int())},
+      {},
       // Resizing Logic
       nullptr,
       {}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
index 58557788138..1a8a258627e 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -94,12 +94,12 @@ void add_max_pool2d_node(
        {arg, api::MemoryAccessType::READ}},
       // Shader params buffers
       {
-          t_out->sizes_ubo(),
+          t_out->texture_limits_ubo(),
           t_in->sizes_ubo(),
           graph.create_params_buffer(kernel_params),
       },
       // Specialization Constants
-      {t_out->gpu_memory_layout_int()},
+      {},
       // Resizing Logic
       resize_max_pool2d_node,
       {kernel_size, stride, padding, dilation, ceil_mode}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
index 073eae77ce4..1d85984ef18 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
@@ -114,14 +114,15 @@ void add_select_int_node(
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
       // Parameter buffers
-      {t_out->sizes_ubo(),
+      {t_out->texture_limits_ubo(),
+       t_out->sizes_ubo(),
        // TODO: num_batches and num_texel_per_batch are provided by
        // t_out->sizes. Can change the following to reduce params
        // created.
        graph.create_params_buffer(api::utils::make_ivec4(
            {index, num_batches, num_texel_per_batch, 0}))},
       // Specialization Constants
-      {SV(t_out->gpu_memory_layout_int())}));
+      {}));
 }
 
 void select_int(ComputeGraph& graph, const std::vector<ValueRef>& args) {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp
index 652340d1dc6..cf7f891cdcb 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp
@@ -87,12 +87,12 @@ void add_sum_dim_node(
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}, {arg, api::MemoryAccessType::READ}},
       // Shader params buffers
-      {t_out->sizes_ubo(),
+      {t_out->texture_limits_ubo(),
        graph.create_params_buffer(dim + 4 - in_dim),
        graph.create_params_buffer(dim_size),
        graph.create_params_buffer(int(ceil(channel / 4.0)))},
       // Specialization Constants
-      {t_out->gpu_memory_layout_int()},
+      {},
       // Resizing Logic
       resize_sum_node,
       {out, in, static_cast<int>(dim), keepdim}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
index 0d28f52e1c2..3888118b90d 100644
--- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
@@ -55,11 +55,11 @@ void add_unary_op_node(
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}, {arg, api::MemoryAccessType::READ}},
       // Shader params buffers
-      {t_out->sizes_ubo(),
+      {t_out->texture_limits_ubo(),
        graph.create_params_buffer(min),
        graph.create_params_buffer(max)},
       // Specialization Constants
-      {t_out->gpu_memory_layout_int()},
+      {},
       // Resizing Logic
       resize_unary_op_node));
 }
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index aecc27d966f..4955d0537ee 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -287,8 +287,8 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
   vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
 
-  // Allocations will be made for uniform buffers containing tensor metadata
-  EXPECT_TRUE(get_vma_allocation_count() == 3);
+  // No allocations made so far
+  EXPECT_TRUE(get_vma_allocation_count() == 0);
 
   std::vector<float> data_a(a.gpu_numel());
   std::fill(data_a.begin(), data_a.end(), 2.5f);
@@ -303,8 +303,8 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   api::MemoryAllocation c_mem = allocate_memory_for(c);
   c.image().bind_allocation(c_mem);
 
-  // One additional allocation for each tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 6);
+  // One allocation for each tensor
+  EXPECT_TRUE(get_vma_allocation_count() == 3);
 
   fill_vtensor(a, data_a);
   fill_vtensor(b, data_b);
@@ -332,8 +332,8 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
   vTensor d = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
   vTensor e = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
 
-  // Allocations will be made for uniform buffers containing tensor metadata
-  EXPECT_TRUE(get_vma_allocation_count() == 5);
+  // No allocations made so far
+  EXPECT_TRUE(get_vma_allocation_count() == 0);
 
   // a and d can share the same memory allocation
   api::MemoryAllocation a_d_mem = allocate_memory_for(a);
@@ -347,8 +347,8 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
   api::MemoryAllocation c_mem = allocate_memory_for(c);
   c.image().bind_allocation(c_mem);
 
-  // 3 additional allocations should be made
-  EXPECT_TRUE(get_vma_allocation_count() == 8);
+  // 3 allocations should be made
+  EXPECT_TRUE(get_vma_allocation_count() == 3);
 
   // Specify input data
   std::vector<float> data_a(a.gpu_numel());
@@ -407,12 +407,12 @@ TEST_F(VulkanComputeAPITest, resource_destructor_non_owning_memory) {
     vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
 
     memory = allocate_memory_for(a);
-    EXPECT_TRUE(get_vma_allocation_count() == 2);
+    EXPECT_TRUE(get_vma_allocation_count() == 1);
     a.image().bind_allocation(memory);
   }
 
   // Check that the memory is still allocated
-  EXPECT_TRUE(get_vma_allocation_count() == 2);
+  EXPECT_TRUE(get_vma_allocation_count() == 1);
 }
 
 TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) {
@@ -421,8 +421,8 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) {
   std::vector<int64_t> sizes = {4, 4, 1};
   vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
 
-  // Allocation for uniform containing tensor metadata
-  EXPECT_TRUE(get_vma_allocation_count() == 1);
+  // No allocations yet
+  EXPECT_TRUE(get_vma_allocation_count() == 0);
 
   std::vector<float> data_a(a.gpu_numel());
   std::fill(data_a.begin(), data_a.end(), 2.5f);