From 337c8e988574ef04e552f71609851df351229be7 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Tue, 21 Jan 2025 15:26:13 -0800
Subject: [PATCH] [ET-VK] Using push constants for conv2d pw.

This diff switches the conv2d pointwise (pw) shader in ExecuTorch's Vulkan backend from uniform buffer objects (UBOs) to push constants for passing its shader parameters. This optimization improves performance and memory usage.

Differential Revision: [D68400677](https://our.internmc.facebook.com/intern/diff/D68400677/)

[ghstack-poisoned]
---
 .../runtime/graph/ops/glsl/conv2d_pw.glsl     | 23 +++--
 .../runtime/graph/ops/impl/Convolution.cpp    | 83 ++++++++++++++-----
 2 files changed, 78 insertions(+), 28 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
index f72c487fa78..0413eb7b7aa 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -24,11 +24,20 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
 ${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
 ${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
 ${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
-${layout_declare_ubo(4, "ivec3", "out_limits")}
-${layout_declare_ubo(5, "ivec4", "in_sizes")}
-${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
-${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
-${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 out_limits; // declared as ivec4; the bounds checks in main() compare against out_limits.xyz
+  ivec4 in_sizes;
+  ivec2 kernel_size; // host pushes kernel_size and stride together as one ivec4
+  ivec2 stride;
+  ivec2 padding; // host pushes padding and dilation together as one ivec4
+  ivec2 dilation;
+  ivec2 overlay_region;
+  int in_group_size;
+  int dummy_padding; // filler; host pushes the preceding params with a sizeof(ivec4) length - presumably this keeps out_min aligned with the host layout (TODO confirm)
+  float out_min;
+  float out_max;
+};
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -70,7 +79,7 @@ void main() {
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) {
+  if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits.xyz))) {
     return;
   }
 
@@ -144,7 +153,7 @@ void main() {
 
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
     const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
-    if (all(lessThan(ivec3(pos, gpos.z), out_limits))) {
+    if (all(lessThan(ivec3(pos, gpos.z), out_limits.xyz))) {
       imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
     }
   }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 8c369914c1b..3c367f334d9 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -407,27 +407,68 @@ void add_conv2d_node(
     wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
   }
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
-      graph,
-      shader,
-      wg_size,
-      graph.create_local_wg_size(wg_size),
-      // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
-      // Shader params buffers
-      {
-          t_out->logical_limits_ubo(),
-          t_in->sizes_ubo(),
-          graph.create_params_buffer(kernel_params),
-          graph.create_params_buffer(extra_params),
-          graph.create_params_buffer(out_params),
-      },
-      // Specialization Constants
-      {},
-      // Resizing Logic
-      resize_conv2d_node,
-      {weight_data, stride, padding, dilation, transposed, output_padding}));
+  if (method == Conv2dMethod::Pointwise) {
+    const utils::ivec4 kernel_param_size_stride = {
+        kernel_params.kernel_size[0],
+        kernel_params.kernel_size[1],
+        kernel_params.stride[0],
+        kernel_params.stride[1]};
+
+    const utils::ivec4 kernel_param_pad_dial = {
+        kernel_params.padding[0],
+        kernel_params.padding[1],
+        kernel_params.dilation[0],
+        kernel_params.dilation[1]};
+
+    graph.execute_nodes().emplace_back(new DispatchNode(
+        graph,
+        shader,
+        wg_size,
+        graph.create_local_wg_size(wg_size),
+        // Inputs and Outputs
+        {{out, vkapi::MemoryAccessType::WRITE},
+         {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
+        // Shader params buffers
+        {},
+        // Specialization Constants
+        {},
+        // Resizing Logic
+        resize_conv2d_node,
+        {weight_data, stride, padding, dilation, transposed, output_padding},
+        {
+            graph.logical_limits_pc_of(out),
+            graph.sizes_pc_of(in),
+            PushConstantDataInfo(
+                &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+            PushConstantDataInfo(
+                &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+            PushConstantDataInfo(
+                &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+            PushConstantDataInfo(&out_params, sizeof(out_params)),
+        }));
+  } else {
+    graph.execute_nodes().emplace_back(new DispatchNode(
+        graph,
+        shader,
+        wg_size,
+        graph.create_local_wg_size(wg_size),
+        // Inputs and Outputs
+        {{out, vkapi::MemoryAccessType::WRITE},
+         {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
+        // Shader params buffers
+        {
+            t_out->logical_limits_ubo(),
+            t_in->sizes_ubo(),
+            graph.create_params_buffer(kernel_params),
+            graph.create_params_buffer(extra_params),
+            graph.create_params_buffer(out_params),
+        },
+        // Specialization Constants
+        {},
+        // Resizing Logic
+        resize_conv2d_node,
+        {weight_data, stride, padding, dilation, transposed, output_padding}));
+  }
 }
 
 void add_conv1d_node(