diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl new file mode 100644 index 00000000000..78e698fa7e5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D existing_out; +layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in; + +layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs { + ivec4 out_sizes; + ivec4 in_sizes; + // Analogous to the range variable in copy. It defines the # of channels being + // copied. + int channel_range; + int src_channel_offset; + int dst_channel_offset; + int unused; + // Operates on (x, y, z) extents. + ivec3 range; + int unused1; + ivec3 dst_offset; + int unused2; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + // Note: Unlike other shaders, the range is often not equal to the destination + // texture extent. + const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, range))) { + return; + } + + const ivec3 out_pos = pos + dst_offset; + + const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim); + + // First read the existing values to make sure the boundary values stay. 
+ VEC4_T v = VEC4_T(texelFetch(existing_out, out_pos, 0)); + + for (int i=0; i<4; i++) { + ivec4 in_whcn = out_whcn; + + in_whcn.z = out_whcn.z - dst_channel_offset + i; + + // Handle the partial update for the beginning of a channel in an existing tensor. + // If the source channel index is below zero or exceeds the range, we skip + // updating the element to avoid overwriting existing data. + if ((in_whcn.z < 0) || (in_whcn.z >= channel_range)) { + continue; + } + + // Readjust for the source offset. + in_whcn.z = in_whcn.z + src_channel_offset; + + ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim); + v[i] = VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w]; + } + + imageStore(image_out, out_pos, v); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml new file mode 100644 index 00000000000..3887647ff83 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml @@ -0,0 +1,10 @@ +copy_channel_offset: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: copy_channel_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 17b3e06e61e..0d1d3420a52 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -10,26 +10,12 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - layout(std430) buffer; -#include "indexing_utils.h" - layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) 
uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - - - -layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs { +layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs { ivec3 range; int unused0; ivec3 src_offset; diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 0a5e20e4f7c..d599a00c2eb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -8,38 +8,39 @@ #include +#include #include #include #include namespace vkcompute { +using api::utils::ivec3; +using api::utils::uvec3; + void add_copy_offset_node( ComputeGraph& graph, const ValueRef in, - const api::utils::ivec3& range, - const api::utils::ivec3& src_offset, - const api::utils::ivec3& dst_offset, + const ivec3& range, + const ivec3& src_offset, + const ivec3& dst_offset, const ValueRef out) { vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); - VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked)); - std::string kernel_name = "copy_offset"; kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); - api::utils::uvec3 global_size = api::utils::make_uvec3(range); - api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + uvec3 global_size = api::utils::make_uvec3(range); + uvec3 local_size = adaptive_work_group_size(global_size); const struct Block final { - api::utils::ivec3 range; + ivec3 range; int32_t unused0; - api::utils::ivec3 src_offset; + ivec3 src_offset; int32_t unused1; - api::utils::ivec3 dst_offset; + ivec3 dst_offset; int32_t unused2; } offset_params{ range, @@ -58,13 +59,166 @@ void add_copy_offset_node( global_size, local_size, // Inputs and Outputs - {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}}, + { + {out, api::MemoryAccessType::WRITE}, + {in, 
api::MemoryAccessType::READ}, + }, // Parameter buffers - {t_out->texture_limits_ubo(), - t_in->texture_limits_ubo(), - graph.create_params_buffer(offset_params)}, + {graph.create_params_buffer(offset_params)}, // Specialization Constants {})); } +void add_copy_channel_offset_node( + ComputeGraph& graph, + const ValueRef in, + int32_t channel_range, + int32_t src_channel_offset, + int32_t dst_channel_offset, + const ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + // Likely need to prepad these numbers. + std::vector in_sizes = t_in->sizes(); + std::vector out_sizes = t_out->sizes(); + + VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked)); + VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked)); + + // NOTE: This function should be able to support 1d and 2d tensors when + // range=1, src_offset=dst_offset=1. + VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3"); + VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3"); + + VK_CHECK_COND( + dim_at(in_sizes) >= src_channel_offset + channel_range, + "Source channel plus range should be less than or equal to input tensor's channel size"); + VK_CHECK_COND( + dim_at(out_sizes) >= dst_channel_offset + channel_range, + "Source channel and range should be less than or equal to input tensor's channel size"); + + VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative"); + VK_CHECK_COND( + src_channel_offset >= 0, "Src channel offset must be non-negative"); + VK_CHECK_COND( + dst_channel_offset >= 0, "Dst channel offset must be non-negative"); + + std::string kernel_name = "copy_channel_offset"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + int32_t out_channels = dim_at(out_sizes); + + // Copy one batch at a time. 
+ for (int batch_idx = 0; batch_idx < dim_at(in_sizes); + batch_idx++) { + // Mapping the tensor NCHW coordinates into texture XYZ coordinates + int32_t dst_first_z = dst_channel_offset / 4; + int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4; + + // We copy the entire width and height dimension. For the channel dimension, + // we use the z-dimension of the global_size to specify the texture range. + // The shader combines the global invocation id and the dst_offset to get + // the actual coordinate. + + ivec3 dst_offset{ + 0, 0, dst_first_z + batch_idx * api::utils::div_up(out_channels, 4)}; + + uvec3 global_size{ + dim_at(in_sizes), + dim_at(in_sizes), + api::utils::safe_downcast(dst_last_z - dst_first_z + 1)}; + + uvec3 local_size = adaptive_work_group_size(global_size); + + const struct Block final { + api::utils::ivec4 out_sizes; + api::utils::ivec4 in_sizes; + int32_t channel_range; + int32_t src_channel_offset; + int32_t dst_channel_offset; + int32_t unused; + ivec3 range; + int32_t unused1; + ivec3 dst_offset; + int32_t unused2; + + } channel_offset_params{ + api::utils::make_whcn_ivec4(out_sizes), + api::utils::make_whcn_ivec4(in_sizes), + channel_range, + src_channel_offset, + dst_channel_offset, + 0, + api::utils::make_ivec3(global_size), + 0, + dst_offset, + 0, + }; + + auto shader = VK_KERNEL_FROM_STR(kernel_name); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + local_size, + // Inputs and Outputs + { + {out, api::MemoryAccessType::WRITE}, + {out, api::MemoryAccessType::READ}, + {in, api::MemoryAccessType::READ}, + }, + // Parameter buffers + {graph.create_params_buffer(channel_offset_params)}, + // Specialization Constants + {})); + } +} + +void add_copy_offset_node( + ComputeGraph& graph, + ValueRef in, + ValueRef range_ref, + ValueRef src_offset_ref, + ValueRef dst_offset_ref, + ValueRef out) { + ivec3 range = api::utils::make_ivec3(*graph.get_int_list(range_ref)); 
+ ivec3 src_offset = + api::utils::make_ivec3(*graph.get_int_list(src_offset_ref)); + ivec3 dst_offset = + api::utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); + + add_copy_offset_node(graph, in, range, src_offset, dst_offset, out); +} + +void copy_offset(ComputeGraph& graph, const std::vector& args) { + add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]); +} + +void copy_channel_offset( + ComputeGraph& graph, + const std::vector& args) { + ValueRef in = args[0]; + ValueRef channel_range_ref = args[1]; + ValueRef src_channel_offset_ref = args[2]; + ValueRef dst_channel_offset_ref = args[3]; + ValueRef out = args[4]; + + auto channel_range = graph.extract_scalar(channel_range_ref); + auto src_channel_offset = + graph.extract_scalar(src_channel_offset_ref); + auto dst_channel_offset = + graph.extract_scalar(dst_channel_offset_ref); + + add_copy_channel_offset_node( + graph, in, channel_range, src_channel_offset, dst_channel_offset, out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.copy_offset, copy_offset); + VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset); +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h index 6e0deb6b74e..60a58b2fa84 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h @@ -14,6 +14,13 @@ namespace vkcompute { +// add_copy_offset_node resembles the vkCmdCopyImage command. It copies the +// texture extents specified by the range, src_offset, and dst_offset (all are +// in texture coordinate (x, y, z)) from the input image to the output image. +// +// It is possible to have input and output point to the same image +// object. But when the source range and destination range overlap, the behavior +// is undefined. 
void add_copy_offset_node( ComputeGraph& graph, const ValueRef in, @@ -22,4 +29,25 @@ void add_copy_offset_node( const api::utils::ivec3& dst_offset, const ValueRef out); +// add_copy_channel_offset_node behaves similar to add_copy_node, except that it +// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW). +// The range and offset arguments are in the tensor coordinate. It assumes the +// underlying texture is channel-packed. +// +// This function is specialized implementation for copying +// channel packed values. The complication comes from when reading / writing the +// channel dimension on indices that are not aligned to packing, we will need +// be careful about the boundaries. +// +// It achieves the following: +// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] = +// in [:, src_channel_offset:src_channel_offset + channel_range, :, :] +void add_copy_channel_offset_node( + ComputeGraph& graph, + const ValueRef in, + int32_t channel_range, + int32_t src_channel_offset, + int32_t dst_channel_offset, + const ValueRef out); + } // namespace vkcompute diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index a1f3b93dc3a..87f9d2a915d 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -12,6 +12,7 @@ #include +#include #include #include @@ -153,6 +154,26 @@ check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { } } +inline int64_t get_buf_idx( + ComputeGraph& graph, + IOValueRef ref, + const std::vector& tensor_coor) { + vTensorPtr vten_ptr = graph.get_tensor(ref.value); + + const std::vector& sizes = vten_ptr->sizes(); + + int64_t c = dim_at(sizes); + int64_t h = dim_at(sizes); + int64_t w = dim_at(sizes); + + int64_t ni = dim_at(tensor_coor); + int64_t ci = dim_at(tensor_coor); + int64_t hi = dim_at(tensor_coor); + int64_t wi = dim_at(tensor_coor); + + return (ni * c * h * w + ci * h * w + hi * w + wi); +} 
+ // // Context Management // diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 4955d0537ee..abd8b725b43 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -876,6 +876,274 @@ TEST(VulkanComputeGraphTest, test_large_graph) { } } +TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { + GraphConfig config; + ComputeGraph graph(config); + + int64_t n = 6; + int64_t c = 12; + int64_t h = 4; + int64_t w = 8; + api::GPUMemoryLayout memory_layout = + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + + std::vector size = {n, c, h, w}; + + IOValueRef a = graph.add_input_tensor(size, api::kFloat, memory_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(size, api::kFloat, memory_layout); + + // Notice that copy_node operates on the input texture's x, y, z dimension. In the + // comment, we provide the corresponding coordinate in nchw. + + // src_offset is (n=0, c=4, h=1, w=1) + ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); + + // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate + // Argument is {x, y, z}. + // x = 0 since w = 0 + // y = 2 since h = 2 + // z = c / 4 + 2 since + // 1. there are c/4 planes per batch, n=1 means we are on the first batch; + // 2. +2 because c = 8, with channel packing it means two texels. + ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); + + // range is (n=1, c=8, h=2, w=4) + // Argument is {x, y, z}. + // x = 4 since w = 4 + // y = 2 since h = 2 + // z = 2 since we are only copying 8 channels, hence 2 texels. n = 1 can be a + // bit misleading here, since it gives the impression that we are copying the + // entire channel. However, remember when we copy, we are trying to + // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], + // range must be non-zero. 
+ ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); + + auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); + copyFn( + graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + fill_vtensor(graph, a, 0.0f, /*iota = */ true); + + graph.execute(); + + EXTRACT_TENSOR(out); + EXTRACT_TENSOR(a); + + // We will examine the results in the dst_range. + // The value in the corresponding coordinate should match between the source + // and destination tensor. We loop through the range, calculate both the src and + // dst index using the offsets, and compare the values in the extracted + // vector. They should match. + int n_idx = 0; + // at each nested loop, index range from dst_offset to dst_offset + range + + for (int c_idx = 0; c_idx < 8; c_idx++) { + for (int h_idx = 0; h_idx < 2; h_idx++) { + for (int w_idx = 0; w_idx < 4; w_idx++) { + auto dst_idx = + get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); + auto src_idx = + get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); + + EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); + } + } + } +} + +TEST(VulkanComputeGraphTest, test_etvk_copy_channel_offset_node) { + GraphConfig config; + ComputeGraph graph(config); + + int64_t n = 2; + int64_t c = 12; + int64_t h = 4; + int64_t w = 8; + api::GPUMemoryLayout memory_layout = + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + + std::vector size = {n, c, h, w}; + + IOValueRef a = graph.add_input_tensor(size, api::kFloat, memory_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(size, api::kFloat, memory_layout); + + int64_t src_offset = 2; + int64_t dst_offset = 3; + int64_t range = 7; + + ValueRef src_offset_ref = graph.add_scalar(src_offset); + ValueRef dst_offset_ref = graph.add_scalar(dst_offset); + ValueRef range_ref = graph.add_scalar(range); + + auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); + copyFn( + 
graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + fill_vtensor(graph, a, 0.0f, true); + + graph.execute(); + + EXTRACT_TENSOR(out); + EXTRACT_TENSOR(a); + + for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = 0; c_idx < range; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto src_idx = + get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); + auto dst_idx = get_buf_idx( + graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); + } + } + } + } +} + +TEST( + VulkanComputeGraphTest, + test_etvk_copy_channel_offset_node_clean_boundary) { + // Tricky part for channel copy is handling the boundary across multiple copy. + // For example, when we concat two [3, 1, 1] nchw-tensors along the channel + // dimension, due to channel packing, elements from different source texel + // will be packed into same destination texel at the boundaries. + GraphConfig config; + ComputeGraph graph(config); + + int64_t n = 2; + int64_t c = 12; + int64_t h = 4; + int64_t w = 8; + api::GPUMemoryLayout memory_layout = + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; + + std::vector size = {n, c, h, w}; + + IOValueRef zero = graph.add_input_tensor(size, api::kFloat, memory_layout); + IOValueRef a = graph.add_input_tensor(size, api::kFloat, memory_layout); + IOValueRef b = graph.add_input_tensor(size, api::kFloat, memory_layout); + + IOValueRef out = {}; + out.value = graph.add_tensor(size, api::kFloat, memory_layout); + + auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); + + // Make sure entire out tensor is zeroed. The zero tensor will be filled with + // zero later. 
+ copyFn( + graph, + {zero.value, + graph.add_scalar(c), + graph.add_scalar(0), + graph.add_scalar(0), + out.value}); + + int64_t a_src_offset = 0; + int64_t a_dst_offset = 2; + int64_t a_range = 5; + // a will write to channel [2, 7) + copyFn( + graph, + {a.value, + graph.add_scalar(a_range), + graph.add_scalar(a_src_offset), + graph.add_scalar(a_dst_offset), + out.value}); + + // b will write to channel [6, 11) + // Intentional for b to override channel=6 + int64_t b_src_offset = 0; + int64_t b_dst_offset = 6; + int64_t b_range = 5; + + copyFn( + graph, + {b.value, + graph.add_scalar(b_range), + graph.add_scalar(b_src_offset), + graph.add_scalar(b_dst_offset), + out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + float a_value = 1.0f; + float b_value = 2.0f; + float zero_value = 0.0f; + fill_vtensor(graph, a, a_value); + fill_vtensor(graph, b, b_value); + fill_vtensor(graph, zero, zero_value); + + graph.execute(); + + EXTRACT_TENSOR(out); + + for (int n_idx = 0; n_idx < n; n_idx++) { + // c_idx only up to a_range-1 because of the expected overwrite by b + for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1; + c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == a_value); + } + } + } + } + + for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == b_value); + } + } + } + } + + // Also verify that data before a_dst_offset and after b_dst_offset + b_range + // are untouched. 
+ for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == zero_value); + } + } + } + } + + for (int n_idx = 0; n_idx < n; n_idx++) { + for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) { + for (int h_idx = 0; h_idx < h; h_idx++) { + for (int w_idx = 0; w_idx < w; w_idx++) { + auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); + EXPECT_TRUE(data_out[dst_idx] == zero_value); + } + } + } + } +} + class VulkanToFromGPUShaderTest : public ::testing::Test { public: void SetUp() override {