diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
new file mode 100644
index 00000000000..c623fdb4c31
--- /dev/null
+++ b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <type_traits>
+
+#include <c10/util/irange.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h>
+
+namespace torch::executor {
+
+namespace internal {
+template <std::size_t kNumInputs>
+class BroadcastIndexesIterator {
+ public:
+  using difference_type = ssize_t;
+  using value_type = std::array<ssize_t, kNumInputs + 1>;
+  using reference = const value_type&;
+  using pointer = const value_type*;
+  using iterator_category = std::forward_iterator_tag;
+
+  BroadcastIndexesIterator() = default;
+
+  template <typename... Args>
+  explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
+      : output_dim_(output.dim()),
+        output_shape_(output.sizes()),
+        effective_input_broadcast_strides_{
+            effective_input_broadcast_stride(output, args)...} {
+    static_assert(
+        sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),
+        "BroadcastIndexesIterator constructor requires kNumInputs input tensor"
+        " arguments!");
+  }
+
+  struct make_end_t {
+    explicit constexpr make_end_t() = default;
+  };
+
+  template <typename... Args>
+  BroadcastIndexesIterator(make_end_t, const Tensor& t, const Args&... args)
+      : current_indexes_{
+            t.numel(),
+            0,
+        } {}
+
+  bool operator==(const BroadcastIndexesIterator& rhs) const {
+    return output_index() == rhs.output_index();
+  }
+
+  bool operator!=(const BroadcastIndexesIterator& rhs) const {
+    return !operator==(rhs);
+  }
+
+  reference operator*() const {
+    return current_indexes_;
+  }
+
+  pointer operator->() const {
+    return &current_indexes_;
+  }
+
+  BroadcastIndexesIterator& operator++() {
+    output_index()++;
+    // TODO: add optimization for particular input tensors not being
+    // broadcasted?
+    for (auto ii = output_dim_ - 1; ii >= 0; --ii) {
+      // You might wonder what happens if output_shape_[ii] == 0. In
+      // that case, output.numel() would be 0, and thus we would have
+      // begin() == end() and no iteration.
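+      //
+      // For intuition about the update below (illustrative example,
+      // not from the original source): consider output shape [3, 4]
+      // with one input of shape [1, 4], whose effective broadcast
+      // strides are [0, 1]. Stepping from output index 3
+      // (delinearized [0, 3]) to 4 (delinearized [1, 0]) takes the
+      // wrap-around branch for the last dimension, subtracting 3 * 1
+      // from the input index, then the increment branch for dimension
+      // 0, adding its stride of 0: the input index wraps from 3 back
+      // to 0 as the broadcast row repeats.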
+      if ET_UNLIKELY (
+          delinearized_output_index_[ii] == output_shape_[ii] - 1) {
+        const auto old_delinearized_output_index_item =
+            delinearized_output_index_[ii];
+        delinearized_output_index_[ii] = 0;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          current_indexes_[jj] -= old_delinearized_output_index_item *
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+      } else {
+        delinearized_output_index_[ii]++;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          current_indexes_[jj] +=
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+        break;
+      }
+    }
+    return *this;
+  }
+
+  BroadcastIndexesIterator operator++(int) {
+    auto it = *this;
+    operator++();
+    return it;
+  }
+
+  difference_type operator-(const BroadcastIndexesIterator& rhs) const {
+    return difference_type(output_index() - rhs.output_index());
+  }
+
+ private:
+  ssize_t output_index() const {
+    return current_indexes_[0];
+  }
+
+  ssize_t& output_index() {
+    return current_indexes_[0];
+  }
+
+  std::array<
+      executorch::aten::StridesType,
+      executorch::runtime::kTensorDimensionLimit>
+  effective_input_broadcast_stride(const Tensor& output, const Tensor& t)
+      const {
+    std::array<
+        executorch::aten::StridesType,
+        executorch::runtime::kTensorDimensionLimit>
+        result = {0};
+    ET_CHECK_MSG(
+        t.dim() <= output.dim(),
+        "input to broadcasting op should have dim at most output dim, but %d > %d!",
+        (int)t.dim(),
+        (int)output.dim());
+
+    const auto num_leading_ones = output.dim() - t.dim();
+    for (const auto idx : c10::irange(num_leading_ones)) {
+      result[idx] = 0;
+    }
+    const auto t_sizes = t.sizes();
+    const auto t_strides = t.strides();
+    for (const auto idx :
+         c10::irange(num_leading_ones, num_leading_ones + t.dim())) {
+      result[idx] = t_sizes[idx - num_leading_ones] == 1
+          ? 0
+          : t_strides[idx - num_leading_ones];
+    }
+    return result;
+  }
+
+  // The 0th entry is the current linear index into the output,
+  // followed by kNumInputs input indexes.
+  std::array<ssize_t, kNumInputs + 1> current_indexes_ = {0};
+  using ShapeType = std::array<
+      executorch::aten::SizesType,
+      executorch::runtime::kTensorDimensionLimit>;
+  ShapeType delinearized_output_index_ = {0};
+  ssize_t output_dim_;
+  executorch::aten::ArrayRef<executorch::aten::SizesType> output_shape_;
+  // The linear index for a broadcast tensor is
+  // sum(delinearized_output_index_[i] * input_stride_[i] if
+  // padded_input_shape_[i] != 1 else 0), where padded_input_shape is
+  // input.sizes() with leading 1s added to make its size equal to
+  // output_dim. This is straightforwardly implementable with an
+  // adjusted stride array that contains 0s where the padded input
+  // shape would contain 1s.
+  std::array<
+      std::array<
+          executorch::aten::StridesType,
+          executorch::runtime::kTensorDimensionLimit>,
+      kNumInputs>
+      effective_input_broadcast_strides_ = {{{0}}};
+};
+} // namespace internal
+
+/**
+ * Efficient mechanism for looping over the index space for an output
+ * tensor and kNumInputs possibly-broadcasted input tensors. Use as follows:
+ *
+ * auto* output_data = output.mutable_data_ptr<OutputType>();
+ * const auto* a_data = a.const_data_ptr<AType>();
+ * const auto* b_data = b.const_data_ptr<BType>();
+ * for (const auto [output_index, a_index, b_index] :
+ *      BroadcastIndexesRange<2>(output, a, b)) {
+ *   // Access output_data[output_index], a_data[a_index], and b_data[b_index].
+ * }
+ *
+ * (where OutputType, AType, and BType are known concrete types.)
+ *
+ * Unlike looping using delinearize_index() and
+ * linearize_access_indexes(), BroadcastIndexesRange avoids expensive
+ * division and modulo operations on each iteration.
+ */
+template <std::size_t kNumInputs>
+class BroadcastIndexesRange {
+ public:
+  using iterator = internal::BroadcastIndexesIterator<kNumInputs>;
+
+  template <typename... Args>
+  BroadcastIndexesRange(const Tensor& output, const Args&... args)
+      : tensors_{&output, (&args)...} {}
+
+  iterator begin() const {
+    return std::apply(
+        [](const auto&... args) { return iterator((*args)...); }, tensors_);
+  }
+
+  iterator end() const {
+    return std::apply(
+        [](const auto&... args) {
+          return iterator(typename iterator::make_end_t(), (*args)...);
+        },
+        tensors_);
+  }
+
+ private:
+  std::array<const Tensor*, kNumInputs + 1> tensors_;
+};
+} // namespace torch::executor
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index 2b22687274f..c42f38fd8b0 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -278,6 +278,19 @@ def define_common_targets():
         visibility = ["//executorch/kernels/portable/cpu/..."],
     )
 
+    runtime.cxx_library(
+        name = "broadcast_indexes_range",
+        exported_headers = ["broadcast_indexes_range.h"],
+        deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:tensor_dimension_limit",
+        ],
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in get_aten_mode_options():
         suffix = "_aten" if aten_mode else ""
diff --git a/kernels/portable/cpu/util/test/CMakeLists.txt b/kernels/portable/cpu/util/test/CMakeLists.txt
index 5f81e4b6aec..b92e8ebfae1 100644
--- a/kernels/portable/cpu/util/test/CMakeLists.txt
+++ b/kernels/portable/cpu/util/test/CMakeLists.txt
@@ -19,7 +19,9 @@
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
 
 include(${EXECUTORCH_ROOT}/build/Test.cmake)
 
-set(_test_srcs broadcast_test.cpp reduce_test.cpp)
+set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp
+    reduce_test.cpp
+)
 
 et_cxx_test(
   kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS
diff --git a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp
new file mode 100644
index 00000000000..d1db40fca48
--- /dev/null
+++ b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include +#include + +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::testing::TensorFactory; +using torch::executor::BroadcastIndexesRange; +using torch::executor::delinearize_index; +using torch::executor::linearize_access_indexes; + +namespace { +template +auto range_to_vec(const Range& rng) { + return std::vector( + rng.begin(), rng.end()); +} +} // namespace +TEST(BroadcastIndexesRangeTest, Empty) { + TensorFactory tf; + + Tensor a = tf.make({0}, {}); + ASSERT_EQ(a.numel(), 0); + bool loop_entered = false; + for (auto _ : BroadcastIndexesRange<1>(a, a)) { + loop_entered = true; + } + EXPECT_FALSE(loop_entered); +} + +// [W] -> [W] +TEST(BroadcastIndexesRangeTest, OneDNotBroadcasted) { + TensorFactory tf; + + Tensor out = tf.zeros({5}); + int idx = 0; + for (const auto& elem : range_to_vec(BroadcastIndexesRange<1>(out, out))) { + EXPECT_EQ(elem[0], idx++); + EXPECT_EQ(elem[0], elem[1]); + } +} + +// [1] -> [W] +TEST(BroadcastIndexesRangeTest, ScalarBroadcastToOneD) { + TensorFactory tf; + + Tensor out = tf.zeros({5}); + Tensor in = tf.zeros({1}); + + auto actual = range_to_vec(BroadcastIndexesRange<1>(out, in)); + decltype(actual) expected = { + {0, 0}, + {1, 0}, + {2, 0}, + {3, 0}, + {4, 0}, + }; + EXPECT_EQ(expected, actual); +} + +// [1] -> [H, W] +// [W] -> [H, W] +// [1, 1] -> [H, W] +// [1, W] -> [H, W] +// [H, 1] -> [H, W] +// [H, W] -> [H, W] +// Cover all these at the same time to also exercise multiple input tensors. +TEST(BroadcastIndexesRangeTest, OneAndTwoDExhaustive) { + TensorFactory tf; + Tensor out = tf.zeros({3, 4}); + Tensor in_0d_scalar = tf.zeros({}); + Tensor in_1d_scalar = tf.zeros({1}); + Tensor in_2d_scalar = tf.zeros({1, 1}); + + Tensor in_row = tf.zeros({4}); + Tensor in_col = tf.zeros({3, 1}); + + Tensor in_not_broadcast = tf.zeros({3, 4}); + + auto actual = range_to_vec(BroadcastIndexesRange<6>( + out, + in_0d_scalar, + in_1d_scalar, + in_2d_scalar, + in_row, + in_col, + in_not_broadcast)); + decltype(actual) expected = { + {0, 0, 0, 0, 0, 0, 0}, + {1, 0, 0, 0, 1, 0, 1}, + {2, 0, 0, 0, 2, 0, 2}, + {3, 0, 0, 0, 3, 0, 3}, + {4, 0, 0, 0, 0, 1, 4}, + {5, 0, 0, 0, 1, 1, 5}, + {6, 0, 0, 0, 2, 1, 6}, + {7, 0, 0, 0, 3, 1, 7}, + {8, 0, 0, 0, 0, 2, 8}, + {9, 0, 0, 0, 1, 2, 9}, + {10, 0, 0, 0, 2, 2, 10}, + {11, 0, 0, 0, 3, 2, 11}, + }; + EXPECT_EQ(expected, actual); +} + +// Here we assume that the previous tests established that padding +// with leading 1s is working, and test: +// [1, 1, 1] -> [C, H, W] +// [C, H, 1] -> [C, H, W] +// [C, 1, W] -> [C, H, W] +// [1, H, W] -> [C, H, W] +// [C, 1, 1] -> [C, H, W] +// [1, H, 1] -> [C, H, W] +// [1, 1, W] -> [C, H, W] +// [C, H, W] -> [C, H, W] +TEST(BroadcastIndexesRangeTest, ThreeDBroadcasting) { + TensorFactory tf; + Tensor out = tf.zeros({2, 3, 4}); + std::array input_tensors = { + tf.zeros({2, 3, 1}), + tf.zeros({2, 1, 4}), + tf.zeros({1, 3, 4}), + tf.zeros({2, 1, 1}), + tf.zeros({1, 3, 1}), + tf.zeros({1, 1, 4}), + tf.zeros({1, 1, 1}), + tf.zeros({2, 3, 4}), + }; + // Writing out all the indexes would be too cumbersome, so here we + // take the opportunity to mutation test against delinearize_index + // and linearize_access_indexes. 
+  int idx = 0;
+  for (const auto indexes : BroadcastIndexesRange<8>(
+           out,
+           input_tensors[0],
+           input_tensors[1],
+           input_tensors[2],
+           input_tensors[3],
+           input_tensors[4],
+           input_tensors[5],
+           input_tensors[6],
+           input_tensors[7])) {
+    const auto out_idx = indexes[0];
+    EXPECT_EQ(out_idx, idx++);
+    size_t out_indexes[executorch::runtime::kTensorDimensionLimit];
+    delinearize_index(
+        out_idx, out, out_indexes, executorch::runtime::kTensorDimensionLimit);
+    for (const auto tensor_idx : c10::irange(0, input_tensors.size())) {
+      EXPECT_EQ(
+          indexes[tensor_idx + 1],
+          linearize_access_indexes(
+              out_indexes, out.dim(), input_tensors[tensor_idx]));
+    }
+  }
+}
+
+// 4-D should generalize from the cases above, but test a couple of
+// mixed patterns anyway:
+// [N, 1, H, 1] -> [N, C, H, W]
+// [1, C, 1, W] -> [N, C, H, W]
+TEST(BroadcastIndexesRangeTest, FourDBroadcasting) {
+  TensorFactory<ScalarType::Int> tf;
+  Tensor out = tf.zeros({2, 3, 4, 5});
+  Tensor in_broadcast_cw = tf.zeros({2, 1, 4, 1});
+  Tensor in_broadcast_nh = tf.zeros({1, 3, 1, 5});
+
+  // Writing out all the indexes would be too cumbersome, so here we
+  // take the opportunity to mutation test against delinearize_index
+  // and linearize_access_indexes.
+  int idx = 0;
+  for (const auto [out_idx, in_cw_idx, in_nh_idx] :
+       BroadcastIndexesRange<2>(out, in_broadcast_cw, in_broadcast_nh)) {
+    EXPECT_EQ(out_idx, idx++);
+    size_t out_indexes[executorch::runtime::kTensorDimensionLimit];
+    delinearize_index(
+        out_idx, out, out_indexes, executorch::runtime::kTensorDimensionLimit);
+    EXPECT_EQ(
+        in_cw_idx,
+        linearize_access_indexes(out_indexes, out.dim(), in_broadcast_cw));
+    EXPECT_EQ(
+        in_nh_idx,
+        linearize_access_indexes(out_indexes, out.dim(), in_broadcast_nh));
+  }
+}
diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl
index 28988b90dcc..178eb25a79b 100644
--- a/kernels/portable/cpu/util/test/targets.bzl
+++ b/kernels/portable/cpu/util/test/targets.bzl
@@ -12,6 +12,17 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_test(
+        name = "broadcast_indexes_range_test",
+        srcs = ["broadcast_indexes_range_test.cpp"],
+        deps = [
+            "//executorch/kernels/portable/cpu/util:broadcast_indexes_range",
+            "//executorch/kernels/portable/cpu/util:broadcast_util",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+        ],
+    )
+
     runtime.cxx_test(
         name = "reduce_test",
         srcs = ["reduce_test.cpp"],
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index 70cb2d2e44f..cc5e625f1e8 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -7,7 +7,8 @@
       "op_fast_hadamard_transform_test.cpp"
     ],
     "additional_libs": [
-      "custom_ops"
+      "custom_ops",
+      "dumb_fht"
     ]
   },
   {
@@ -61,6 +62,7 @@
   {
     "directory": "kernels/portable/cpu/util/test",
     "sources": [
+      "broadcast_indexes_range_test.cpp",
       "broadcast_test.cpp",
       "reduce_test.cpp"
     ],
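
Usage sketch for reviewers: the snippet below shows how an elementwise
kernel's inner loop could adopt BroadcastIndexesRange. It is a minimal
illustration, not code introduced by this diff: add_broadcasting is a
hypothetical helper, and the sketch assumes out has already been resized
to the broadcast shape and that all three tensors share the dtype CTYPE.

#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>

// Hypothetical adopter of BroadcastIndexesRange (not part of this
// diff): the inner loop of a broadcasting binary add.
template <typename CTYPE>
void add_broadcasting(
    const executorch::aten::Tensor& a,
    const executorch::aten::Tensor& b,
    executorch::aten::Tensor& out) {
  const CTYPE* const a_data = a.const_data_ptr<CTYPE>();
  const CTYPE* const b_data = b.const_data_ptr<CTYPE>();
  CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
  // Each iteration yields one linear index per tensor; there is no
  // per-element division or modulo, unlike a loop built on
  // delinearize_index() and linearize_access_indexes().
  for (const auto [out_ix, a_ix, b_ix] :
       torch::executor::BroadcastIndexesRange<2>(out, a, b)) {
    out_data[out_ix] = a_data[a_ix] + b_data[b_ix];
  }
}

Since BroadcastIndexesIterator only models a forward iterator, the range
is intended for a single forward sweep like the one above.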