diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl new file mode 100644 index 00000000000..78e698fa7e5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D existing_out; +layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs { + ivec4 out_sizes; + ivec4 in_sizes; + // Analogous to the range variable in copy. It defines the # of channels being + // copied. + int channel_range; + int src_channel_offset; + int dst_channel_offset; + int unused; + // Operates on (x, y, z) extents. + ivec3 range; + int unused1; + ivec3 dst_offset; + int unused2; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + // Note: Unlike other shaders, the range is often not equal to the destination + // texture extent. + const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, range))) { + return; + } + + const ivec3 out_pos = pos + dst_offset; + + const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim); + + // First read the existing values to make sure the boundary values stay. 
+ VEC4_T v = VEC4_T(texelFetch(existing_out, out_pos, 0)); + + for (int i=0; i<4; i++) { + ivec4 in_whcn = out_whcn; + + in_whcn.z = out_whcn.z - dst_channel_offset + i; + + // Handle the partial update for the beginning of a channel in an existing tensor. + // If the source channel index is below zero or exceeds the range, we skip + // updating the element to avoid overwriting existing data. + if ((in_whcn.z < 0) || (in_whcn.z >= channel_range)) { + continue; + } + + // Readjust for the source offset. + in_whcn.z = in_whcn.z + src_channel_offset; + + ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim); + v[i] = VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w]; + } + + imageStore(image_out, out_pos, v); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml new file mode 100644 index 00000000000..3887647ff83 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml @@ -0,0 +1,10 @@ +copy_channel_offset: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: copy_channel_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 17b3e06e61e..0d1d3420a52 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -10,26 +10,12 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - layout(std430) buffer; -#include "indexing_utils.h" - layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) 
uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - - - -layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs { +layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs { ivec3 range; int unused0; ivec3 src_offset; diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 0a5e20e4f7c..d599a00c2eb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -8,38 +8,39 @@ #include +#include #include #include #include namespace vkcompute { +using api::utils::ivec3; +using api::utils::uvec3; + void add_copy_offset_node( ComputeGraph& graph, const ValueRef in, - const api::utils::ivec3& range, - const api::utils::ivec3& src_offset, - const api::utils::ivec3& dst_offset, + const ivec3& range, + const ivec3& src_offset, + const ivec3& dst_offset, const ValueRef out) { vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); - VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked)); - std::string kernel_name = "copy_offset"; kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); - api::utils::uvec3 global_size = api::utils::make_uvec3(range); - api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + uvec3 global_size = api::utils::make_uvec3(range); + uvec3 local_size = adaptive_work_group_size(global_size); const struct Block final { - api::utils::ivec3 range; + ivec3 range; int32_t unused0; - api::utils::ivec3 src_offset; + ivec3 src_offset; int32_t unused1; - api::utils::ivec3 dst_offset; + ivec3 dst_offset; int32_t unused2; } offset_params{ range, @@ -58,13 +59,166 @@ void add_copy_offset_node( global_size, local_size, // Inputs and Outputs - {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + { + {out, api::MemoryAccessType::WRITE}, + {in, 
api::MemoryAccessType::READ}, + }, // Parameter buffers - {t_out->texture_limits_ubo(), - t_in->texture_limits_ubo(), - graph.create_params_buffer(offset_params)}, + {graph.create_params_buffer(offset_params)}, // Specialization Constants {})); } +void add_copy_channel_offset_node( + ComputeGraph& graph, + const ValueRef in, + int32_t channel_range, + int32_t src_channel_offset, + int32_t dst_channel_offset, + const ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + // Likely need to prepad these numbers. + std::vector in_sizes = t_in->sizes(); + std::vector out_sizes = t_out->sizes(); + + VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked)); + + // NOTE: This function should be able to support 1d and 2d tensors when + // range=1, src_offset=dst_offset=1. + VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3"); + VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3"); + + VK_CHECK_COND( + dim_at(in_sizes) >= src_channel_offset + channel_range, + "Source channel plus range should be less than or equal to input tensor's channel size"); + VK_CHECK_COND( + dim_at(out_sizes) >= dst_channel_offset + channel_range, + "Source channel and range should be less than or equal to input tensor's channel size"); + + VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative"); + VK_CHECK_COND( + src_channel_offset >= 0, "Src channel offset must be non-negative"); + VK_CHECK_COND( + dst_channel_offset >= 0, "Dst channel offset must be non-negative"); + + std::string kernel_name = "copy_channel_offset"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + int32_t out_channels = dim_at(out_sizes); + + // Copy one batch at a time. 
+ for (int batch_idx = 0; batch_idx < dim_at(in_sizes); + batch_idx++) { + // Mapping the tensor NCHW coordinates into texture XYZ coordinates + int32_t dst_first_z = dst_channel_offset / 4; + int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4; + + // We copy the entire width and height dimension. For the channel dimension, + // we use the z-dimension of the global_size to specify the texture range. + // The shader combines the global invocation id and the dst_offset to get + // the actual coordinate. + + ivec3 dst_offset{ + 0, 0, dst_first_z + batch_idx * api::utils::div_up(out_channels, 4)}; + + uvec3 global_size{ + dim_at(in_sizes), + dim_at(in_sizes), + api::utils::safe_downcast(dst_last_z - dst_first_z + 1)}; + + uvec3 local_size = adaptive_work_group_size(global_size); + + const struct Block final { + api::utils::ivec4 out_sizes; + api::utils::ivec4 in_sizes; + int32_t channel_range; + int32_t src_channel_offset; + int32_t dst_channel_offset; + int32_t unused; + ivec3 range; + int32_t unused1; + ivec3 dst_offset; + int32_t unused2; + + } channel_offset_params{ + api::utils::make_whcn_ivec4(out_sizes), + api::utils::make_whcn_ivec4(in_sizes), + channel_range, + src_channel_offset, + dst_channel_offset, + 0, + api::utils::make_ivec3(global_size), + 0, + dst_offset, + 0, + }; + + auto shader = VK_KERNEL_FROM_STR(kernel_name); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + { + {out, api::MemoryAccessType::WRITE}, + {out, api::MemoryAccessType::READ}, + {in, api::MemoryAccessType::READ}, + }, + // Parameter buffers + {graph.create_params_buffer(channel_offset_params)}, + // Specialization Constants + {})); + } +} + +void add_copy_offset_node( + ComputeGraph& graph, + ValueRef in, + ValueRef range_ref, + ValueRef src_offset_ref, + ValueRef dst_offset_ref, + ValueRef out) { + ivec3 range = api::utils::make_ivec3(*graph.get_int_list(range_ref)); 
+ ivec3 src_offset = + api::utils::make_ivec3(*graph.get_int_list(src_offset_ref)); + ivec3 dst_offset = + api::utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); + + add_copy_offset_node(graph, in, range, src_offset, dst_offset, out); +} + +void copy_offset(ComputeGraph& graph, const std::vector& args) { + add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]); +} + +void copy_channel_offset( + ComputeGraph& graph, + const std::vector& args) { + ValueRef in = args[0]; + ValueRef channel_range_ref = args[1]; + ValueRef src_channel_offset_ref = args[2]; + ValueRef dst_channel_offset_ref = args[3]; + ValueRef out = args[4]; + + auto channel_range = graph.extract_scalar(channel_range_ref); + auto src_channel_offset = + graph.extract_scalar(src_channel_offset_ref); + auto dst_channel_offset = + graph.extract_scalar(dst_channel_offset_ref); + + add_copy_channel_offset_node( + graph, in, channel_range, src_channel_offset, dst_channel_offset, out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.copy_offset, copy_offset); + VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset); +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h index 6e0deb6b74e..60a58b2fa84 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h @@ -14,6 +14,13 @@ namespace vkcompute { +// add_copy_offset_node resembles the vkCmdCopyImage command. It copies the +// texture extents specified by the range, src_offset, and dst_offset (all are +// in texture coordinate (x, y, z)) from the input image to the output image. +// +// It is possible to have input and output point to the same image +// object. But when the source range and destination range overlap, the behavior +// is undefined. 
void add_copy_offset_node( ComputeGraph& graph, const ValueRef in, @@ -22,4 +29,25 @@ void add_copy_offset_node( const api::utils::ivec3& dst_offset, const ValueRef out); +// add_copy_channel_offset_node behaves similar to add_copy_node, except that it +// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW). +// The range and offset arguments are in the tensor coordinate. It assumes the +// underlying texture is channel-packed. +// +// This function is specialized implementation for copying +// channel packed values. The complication comes from when reading / writing the +// channel dimension on indices that are not aligned to packing, we will need +// be careful about the boundaries. +// +// It achieves the following: +// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] = +// in [:, src_channel_offset:src_channel_offset + channel_range, :, :] +void add_copy_channel_offset_node( + ComputeGraph& graph, + const ValueRef in, + int32_t channel_range, + int32_t src_channel_offset, + int32_t dst_channel_offset, + const ValueRef out); + } // namespace vkcompute diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index a1f3b93dc3a..87f9d2a915d 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -12,6 +12,7 @@ #include +#include #include #include @@ -153,6 +154,26 @@ check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { } } +inline int64_t get_buf_idx( + ComputeGraph& graph, + IOValueRef ref, + const std::vector& tensor_coor) { + vTensorPtr vten_ptr = graph.get_tensor(ref.value); + + const std::vector& sizes = vten_ptr->sizes(); + + int64_t c = dim_at(sizes); + int64_t h = dim_at(sizes); + int64_t w = dim_at(sizes); + + int64_t ni = dim_at(tensor_coor); + int64_t ci = dim_at(tensor_coor); + int64_t hi = dim_at(tensor_coor); + int64_t wi = dim_at(tensor_coor); + + return (ni * c * h * w + ci * h * w + hi * w + wi); +} 
+ // // Context Management // diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 4955d0537ee..abd8b725b43 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -876,6 +876,274 @@ TEST(VulkanComputeGraphTest, test_large_graph) { } } +TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { + GraphConfig config; + ComputeGraph graph(config); + + int64_t n = 6; + int64_t c = 12; + int64_t h = 4; + int64_t w = 8; + api::GPUMemoryLayout memory_layout = + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + + std::vector size = {n, c, h, w}; + + IOValueRef a = graph.add_input_tensor(size, api::kFloat, memory_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(size, api::kFloat, memory_layout); + + // Notice that copy_node operates on the input texture's x, y, z dimension. In the + // comment, we provide the corresponding coordinate in nchw. + + // src_offset is (n=0, c=4, h=1, w=1) + ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); + + // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate + // Argument is {x, y, z}. + // x = 0 since w = 0 + // y = 2 since h = 2 + // z = c / 4 + 2 since + // 1. there are c/4 planes per batch, n=1 means we are on the first batch; + // 2. +2 because c = 8, with channel packing it means two texels. + ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); + + // range is (n=1, c=8, h=2, w=4) + // Argument is {x, y, z}. + // x = 4 since w = 4 + // y = 2 since h = 2 + // z = 2 since we are only copying 8 channels, hence 2 texels. n = 1 can be a + // bit misleading here, since it gives the impression that we are copying the + // entire channel. However, remember when we copy, we are trying to + // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], + // range must be non-zero. 
+ ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); + + auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); + copyFn( + graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + fill_vtensor(graph, a, 0.0f, /*iota = */ true); + + graph.execute(); + + EXTRACT_TENSOR(out); + EXTRACT_TENSOR(a); + + // We will examine the results in the dst_range. + // The value in the corresponding coordinate should match between the source + // and destination tensor. We loop through the range, calculate both the src and + // dst index using the offsets, and compare the values in the extracted + // vector. They should match. + int n_idx = 0; + // at each nested loop, index range from dst_offset to dst_offset + range + + for (int c_idx = 0; c_idx < 8; c_idx++) { + for (int h_idx = 0; h_idx < 2; h_idx++) { + for (int w_idx = 0; w_idx < 4; w_idx++) { + auto dst_idx = + get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); + auto src_idx = + get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); + + EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); + } + } + } +} + +TEST(VulkanComputeGraphTest, test_etvk_copy_channel_offset_node) { + GraphConfig config; + ComputeGraph graph(config); + + int64_t n = 2; + int64_t c = 12; + int64_t h = 4; + int64_t w = 8; + api::GPUMemoryLayout memory_layout = + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + + std::vector size = {n, c, h, w}; + + IOValueRef a = graph.add_input_tensor(size, api::kFloat, memory_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(size, api::kFloat, memory_layout); + + int64_t src_offset = 2; + int64_t dst_offset = 3; + int64_t range = 7; + + ValueRef src_offset_ref = graph.add_scalar(src_offset); + ValueRef dst_offset_ref = graph.add_scalar(dst_offset); + ValueRef range_ref = graph.add_scalar(range); + + auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); + copyFn( + 
graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + fill_vtensor(graph, a, 0.0f, true); + + graph.execute(); + + EXTRACT_TENSOR(out); + EXTRACT_TENSOR(a); + + for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = 0; c_idx < range; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto src_idx = + get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); + auto dst_idx = get_buf_idx( + graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); + } + } + } + } +} + +TEST( + VulkanComputeGraphTest, + test_etvk_copy_channel_offset_node_clean_boundary) { + // Tricky part for channel copy is handling the boundary across multiple copy. + // For example, when we concat two [3, 1, 1] nchw-tensors along the channel + // dimension, due to channel packing, elements from different source texel + // will be packed into same destination texel at the boundaries. + GraphConfig config; + ComputeGraph graph(config); + + int64_t n = 2; + int64_t c = 12; + int64_t h = 4; + int64_t w = 8; + api::GPUMemoryLayout memory_layout = + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + + std::vector size = {n, c, h, w}; + + IOValueRef zero = graph.add_input_tensor(size, api::kFloat, memory_layout); + IOValueRef a = graph.add_input_tensor(size, api::kFloat, memory_layout); + IOValueRef b = graph.add_input_tensor(size, api::kFloat, memory_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(size, api::kFloat, memory_layout); + + auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); + + // Make sure entire out tensor is zeroed. The zero tensor will be filled with + // zero later. 
+ copyFn( + graph, + {zero.value, + graph.add_scalar(c), + graph.add_scalar(0), + graph.add_scalar(0), + out.value}); + + int64_t a_src_offset = 0; + int64_t a_dst_offset = 2; + int64_t a_range = 5; + // a will write to channel [2, 7) + copyFn( + graph, + {a.value, + graph.add_scalar(a_range), + graph.add_scalar(a_src_offset), + graph.add_scalar(a_dst_offset), + out.value}); + + // b will write to channel [6, 11) + // Intentional for b to override channel=6 + int64_t b_src_offset = 0; + int64_t b_dst_offset = 6; + int64_t b_range = 5; + + copyFn( + graph, + {b.value, + graph.add_scalar(b_range), + graph.add_scalar(b_src_offset), + graph.add_scalar(b_dst_offset), + out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + float a_value = 1.0f; + float b_value = 2.0f; + float zero_value = 0.0f; + fill_vtensor(graph, a, a_value); + fill_vtensor(graph, b, b_value); + fill_vtensor(graph, zero, zero_value); + + graph.execute(); + + EXTRACT_TENSOR(out); + + for (int n_idx = 0; n_idx < n; n_idx++) { + // c_idx only up to a_range-1 because of the expected overwrite by b + for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1; + c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == a_value); + } + } + } + } + + for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == b_value); + } + } + } + } + + // Also verify that data before a_dst_offset and after b_dst_offset + b_range + // are untouched. 
+ for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == zero_value); + } + } + } + } + + for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == zero_value); + } + } + } + } +} + class VulkanToFromGPUShaderTest : public ::testing::Test { public: void SetUp() override {