diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
new file mode 100644
index 00000000000..c623fdb4c31
--- /dev/null
+++ b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <type_traits>
+
+#include <c10/util/irange.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h>
+
+namespace torch::executor {
+
+namespace internal {
+template <std::size_t kNumInputs>
+class BroadcastIndexesIterator {
+ public:
+  using difference_type = ssize_t;
+  using value_type = std::array<ssize_t, kNumInputs + 1>;
+  using reference = const value_type&;
+  using pointer = const value_type*;
+  using iterator_category = std::forward_iterator_tag;
+
+  BroadcastIndexesIterator() = default;
+
+  template <typename... Args>
+  explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
+      : output_dim_(output.dim()),
+        output_shape_(output.sizes()),
+        effective_input_broadcast_strides_{
+            effective_input_broadcast_stride(output, args)...} {
+    static_assert(
+        sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),
+        "BroadcastIndexesIterator constructor requires kNumInputs input tensor"
+        " arguments!");
+  }
+
+  struct make_end_t {
+    explicit constexpr make_end_t() = default;
+  };
+
+  template <typename... Args>
+  BroadcastIndexesIterator(make_end_t, const Tensor& t, const Args&... args)
+      : current_indexes_{
+            t.numel(),
+            0,
+        } {}
+
+  bool operator==(const BroadcastIndexesIterator& rhs) const {
+    return output_index() == rhs.output_index();
+  }
+
+  bool operator!=(const BroadcastIndexesIterator& rhs) const {
+    return !operator==(rhs);
+  }
+
+  reference operator*() const {
+    return current_indexes_;
+  }
+
+  pointer operator->() const {
+    return &current_indexes_;
+  }
+
+  BroadcastIndexesIterator& operator++() {
+    output_index()++;
+    // TODO: add optimization for particular input tensors not being
+    // broadcasted?
+    for (auto ii = output_dim_ - 1; ii >= 0; --ii) {
+      // You might wonder what happens if output_shape_[ii] == 0. In
+      // that case, output.numel() would be 0, and thus we would have
+      // begin() == end() and no iteration.
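+      //
+      // For intuition about the update below (illustrative example,
+      // not from the original source): consider output shape [3, 4]
+      // with one input of shape [1, 4], whose effective broadcast
+      // strides are [0, 1]. Stepping from output index 3
+      // (delinearized [0, 3]) to 4 (delinearized [1, 0]) takes the
+      // wrap-around branch for the last dimension, subtracting 3 * 1
+      // from the input index, then the increment branch for dimension
+      // 0, adding its stride of 0: the input index wraps from 3 back
+      // to 0 as the broadcast row repeats.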
+      if ET_UNLIKELY (
+          delinearized_output_index_[ii] == output_shape_[ii] - 1) {
+        const auto old_delinearized_output_index_item =
+            delinearized_output_index_[ii];
+        delinearized_output_index_[ii] = 0;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          current_indexes_[jj] -= old_delinearized_output_index_item *
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+      } else {
+        delinearized_output_index_[ii]++;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          current_indexes_[jj] +=
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+        break;
+      }
+    }
+    return *this;
+  }
+
+  BroadcastIndexesIterator operator++(int) {
+    auto it = *this;
+    operator++();
+    return it;
+  }
+
+  difference_type operator-(const BroadcastIndexesIterator& rhs) const {
+    return difference_type(output_index() - rhs.output_index());
+  }
+
+ private:
+  ssize_t output_index() const {
+    return current_indexes_[0];
+  }
+
+  ssize_t& output_index() {
+    return current_indexes_[0];
+  }
+
+  std::array<
+      executorch::aten::StridesType,
+      executorch::runtime::kTensorDimensionLimit>
+  effective_input_broadcast_stride(const Tensor& output, const Tensor& t)
+      const {
+    std::array<
+        executorch::aten::StridesType,
+        executorch::runtime::kTensorDimensionLimit>
+        result = {0};
+    ET_CHECK_MSG(
+        t.dim() <= output.dim(),
+        "input to broadcasting op should have dim at most output dim, but %d > %d!",
+        (int)t.dim(),
+        (int)output.dim());
+
+    const auto num_leading_ones = output.dim() - t.dim();
+    for (const auto idx : c10::irange(num_leading_ones)) {
+      result[idx] = 0;
+    }
+    const auto t_sizes = t.sizes();
+    const auto t_strides = t.strides();
+    for (const auto idx :
+         c10::irange(num_leading_ones, num_leading_ones + t.dim())) {
+      result[idx] = t_sizes[idx - num_leading_ones] == 1
+          ? 0
+          : t_strides[idx - num_leading_ones];
+    }
+    return result;
+  }
+
+  // The 0th entry is the current linear index into the output,
+  // followed by kNumInputs input indexes.
+  std::array<ssize_t, kNumInputs + 1> current_indexes_ = {0};
+  using ShapeType = std::array<
+      executorch::aten::SizesType,
+      executorch::runtime::kTensorDimensionLimit>;
+  ShapeType delinearized_output_index_ = {0};
+  ssize_t output_dim_;
+  executorch::aten::ArrayRef<executorch::aten::SizesType> output_shape_;
+  // The linear index for a broadcast tensor is
+  // sum(delinearized_output_index_[i] * input_stride_[i] if
+  // padded_input_shape_[i] != 1 else 0), where padded_input_shape is
+  // input.sizes() with leading 1s added to make its size equal to
+  // output_dim. This is straightforwardly implementable with an
+  // adjusted stride array that contains 0s where the padded input
+  // shape would contain 1s.
+  std::array<
+      std::array<
+          executorch::aten::StridesType,
+          executorch::runtime::kTensorDimensionLimit>,
+      kNumInputs>
+      effective_input_broadcast_strides_ = {{{0}}};
+};
+} // namespace internal
+
+/**
+ * Efficient mechanism for looping over the index space for an output
+ * tensor and kNumInputs possibly-broadcasted input tensors. Use as follows:
+ *
+ * auto* output_data = output.mutable_data_ptr<OutputType>();
+ * const auto* a_data = a.const_data_ptr<AType>();
+ * const auto* b_data = b.const_data_ptr<BType>();
+ * for (const auto [output_index, a_index, b_index] :
+ *      BroadcastIndexesRange<2>(output, a, b)) {
+ *   // Access output_data[output_index], a_data[a_index], and b_data[b_index].
+ * }
+ *
+ * (where OutputType, AType, and BType are known concrete types.)
+ *
+ * Unlike looping using delinearize_index() and
+ * linearize_access_indexes(), BroadcastIndexesRange avoids expensive
+ * division and modulo operations on each iteration.
+ */
+template <std::size_t kNumInputs>
+class BroadcastIndexesRange {
+ public:
+  using iterator = internal::BroadcastIndexesIterator<kNumInputs>;
+
+  template <typename... Args>
+  BroadcastIndexesRange(const Tensor& output, const Args&... args)
+      : tensors_{&output, (&args)...} {}
+
+  iterator begin() const {
+    return std::apply(
+        [](const auto&... args) { return iterator((*args)...); }, tensors_);
+  }
+
+  iterator end() const {
+    return std::apply(
+        [](const auto&... args) {
+          return iterator(typename iterator::make_end_t(), (*args)...);
+        },
+        tensors_);
+  }
+
+ private:
+  std::array<const Tensor*, kNumInputs + 1> tensors_;
+};
+} // namespace torch::executor
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index 2b22687274f..c42f38fd8b0 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -278,6 +278,19 @@ def define_common_targets():
         visibility = ["//executorch/kernels/portable/cpu/..."],
     )
 
+    runtime.cxx_library(
+        name = "broadcast_indexes_range",
+        exported_headers = ["broadcast_indexes_range.h"],
+        deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:tensor_dimension_limit",
+        ],
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in get_aten_mode_options():
         suffix = "_aten" if aten_mode else ""
diff --git a/kernels/portable/cpu/util/test/CMakeLists.txt b/kernels/portable/cpu/util/test/CMakeLists.txt
index 5f81e4b6aec..b92e8ebfae1 100644
--- a/kernels/portable/cpu/util/test/CMakeLists.txt
+++ b/kernels/portable/cpu/util/test/CMakeLists.txt
@@ -19,7 +19,9 @@
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
 
 include(${EXECUTORCH_ROOT}/build/Test.cmake)
 
-set(_test_srcs broadcast_test.cpp reduce_test.cpp)
+set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp
+    reduce_test.cpp
+)
 
 et_cxx_test(
   kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS
diff --git a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp
new file mode 100644
index 00000000000..d1db40fca48
--- /dev/null
+++ b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include +#include + +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::testing::TensorFactory; +using torch::executor::BroadcastIndexesRange; +using torch::executor::delinearize_index; +using torch::executor::linearize_access_indexes; + +namespace { +template +auto range_to_vec(const Range& rng) { + return std::vector( + rng.begin(), rng.end()); +} +} // namespace +TEST(BroadcastIndexesRangeTest, Empty) { + TensorFactory tf; + + Tensor a = tf.make({0}, {}); + ASSERT_EQ(a.numel(), 0); + bool loop_entered = false; + for (auto _ : BroadcastIndexesRange<1>(a, a)) { + loop_entered = true; + } + EXPECT_FALSE(loop_entered); +} + +// [W] -> [W] +TEST(BroadcastIndexesRangeTest, OneDNotBroadcasted) { + TensorFactory tf; + + Tensor out = tf.zeros({5}); + int idx = 0; + for (const auto& elem : range_to_vec(BroadcastIndexesRange<1>(out, out))) { + EXPECT_EQ(elem[0], idx++); + EXPECT_EQ(elem[0], elem[1]); + } +} + +// [1] -> [W] +TEST(BroadcastIndexesRangeTest, ScalarBroadcastToOneD) { + TensorFactory tf; + + Tensor out = tf.zeros({5}); + Tensor in = tf.zeros({1}); + + auto actual = range_to_vec(BroadcastIndexesRange<1>(out, in)); + decltype(actual) expected = { + {0, 0}, + {1, 0}, + {2, 0}, + {3, 0}, + {4, 0}, + }; + EXPECT_EQ(expected, actual); +} + +// [1] -> [H, W] +// [W] -> [H, W] +// [1, 1] -> [H, W] +// [1, W] -> [H, W] +// [H, 1] -> [H, W] +// [H, W] -> [H, W] +// Cover all these at the same time to also exercise multiple input tensors. +TEST(BroadcastIndexesRangeTest, OneAndTwoDExhaustive) { + TensorFactory tf; + Tensor out = tf.zeros({3, 4}); + Tensor in_0d_scalar = tf.zeros({}); + Tensor in_1d_scalar = tf.zeros({1}); + Tensor in_2d_scalar = tf.zeros({1, 1}); + + Tensor in_row = tf.zeros({4}); + Tensor in_col = tf.zeros({3, 1}); + + Tensor in_not_broadcast = tf.zeros({3, 4}); + + auto actual = range_to_vec(BroadcastIndexesRange<6>( + out, + in_0d_scalar, + in_1d_scalar, + in_2d_scalar, + in_row, + in_col, + in_not_broadcast)); + decltype(actual) expected = { + {0, 0, 0, 0, 0, 0, 0}, + {1, 0, 0, 0, 1, 0, 1}, + {2, 0, 0, 0, 2, 0, 2}, + {3, 0, 0, 0, 3, 0, 3}, + {4, 0, 0, 0, 0, 1, 4}, + {5, 0, 0, 0, 1, 1, 5}, + {6, 0, 0, 0, 2, 1, 6}, + {7, 0, 0, 0, 3, 1, 7}, + {8, 0, 0, 0, 0, 2, 8}, + {9, 0, 0, 0, 1, 2, 9}, + {10, 0, 0, 0, 2, 2, 10}, + {11, 0, 0, 0, 3, 2, 11}, + }; + EXPECT_EQ(expected, actual); +} + +// Here we assume that the previous tests established that padding +// with leading 1s is working, and test: +// [1, 1, 1] -> [C, H, W] +// [C, H, 1] -> [C, H, W] +// [C, 1, W] -> [C, H, W] +// [1, H, W] -> [C, H, W] +// [C, 1, 1] -> [C, H, W] +// [1, H, 1] -> [C, H, W] +// [1, 1, W] -> [C, H, W] +// [C, H, W] -> [C, H, W] +TEST(BroadcastIndexesRangeTest, ThreeDBroadcasting) { + TensorFactory tf; + Tensor out = tf.zeros({2, 3, 4}); + std::array input_tensors = { + tf.zeros({2, 3, 1}), + tf.zeros({2, 1, 4}), + tf.zeros({1, 3, 4}), + tf.zeros({2, 1, 1}), + tf.zeros({1, 3, 1}), + tf.zeros({1, 1, 4}), + tf.zeros({1, 1, 1}), + tf.zeros({2, 3, 4}), + }; + // Writing out all the indexes would be too cumbersome, so here we + // take the opportunity to mutation test against delinearize_index + // and linearize_access_indexes. 
+  int idx = 0;
+  for (const auto indexes : BroadcastIndexesRange<8>(
+           out,
+           input_tensors[0],
+           input_tensors[1],
+           input_tensors[2],
+           input_tensors[3],
+           input_tensors[4],
+           input_tensors[5],
+           input_tensors[6],
+           input_tensors[7])) {
+    const auto out_idx = indexes[0];
+    EXPECT_EQ(out_idx, idx++);
+    size_t out_indexes[executorch::runtime::kTensorDimensionLimit];
+    delinearize_index(
+        out_idx, out, out_indexes, executorch::runtime::kTensorDimensionLimit);
+    for (const auto tensor_idx : c10::irange(0, input_tensors.size())) {
+      EXPECT_EQ(
+          indexes[tensor_idx + 1],
+          linearize_access_indexes(
+              out_indexes, out.dim(), input_tensors[tensor_idx]));
+    }
+  }
+}
+
+// 4-D should generalize from the cases above, but test a couple of
+// mixed patterns anyway:
+// [N, 1, H, 1] -> [N, C, H, W]
+// [1, C, 1, W] -> [N, C, H, W]
+TEST(BroadcastIndexesRangeTest, FourDBroadcasting) {
+  TensorFactory<ScalarType::Int> tf;
+  Tensor out = tf.zeros({2, 3, 4, 5});
+  Tensor in_broadcast_cw = tf.zeros({2, 1, 4, 1});
+  Tensor in_broadcast_nh = tf.zeros({1, 3, 1, 5});
+
+  // Writing out all the indexes would be too cumbersome, so here we
+  // take the opportunity to mutation test against delinearize_index
+  // and linearize_access_indexes.
+  int idx = 0;
+  for (const auto [out_idx, in_cw_idx, in_nh_idx] :
+       BroadcastIndexesRange<2>(out, in_broadcast_cw, in_broadcast_nh)) {
+    EXPECT_EQ(out_idx, idx++);
+    size_t out_indexes[executorch::runtime::kTensorDimensionLimit];
+    delinearize_index(
+        out_idx, out, out_indexes, executorch::runtime::kTensorDimensionLimit);
+    EXPECT_EQ(
+        in_cw_idx,
+        linearize_access_indexes(out_indexes, out.dim(), in_broadcast_cw));
+    EXPECT_EQ(
+        in_nh_idx,
+        linearize_access_indexes(out_indexes, out.dim(), in_broadcast_nh));
+  }
+}
diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl
index 28988b90dcc..178eb25a79b 100644
--- a/kernels/portable/cpu/util/test/targets.bzl
+++ b/kernels/portable/cpu/util/test/targets.bzl
@@ -12,6 +12,17 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_test(
+        name = "broadcast_indexes_range_test",
+        srcs = ["broadcast_indexes_range_test.cpp"],
+        deps = [
+            "//executorch/kernels/portable/cpu/util:broadcast_indexes_range",
+            "//executorch/kernels/portable/cpu/util:broadcast_util",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+        ],
+    )
+
     runtime.cxx_test(
         name = "reduce_test",
         srcs = ["reduce_test.cpp"],
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index 70cb2d2e44f..cc5e625f1e8 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -7,7 +7,8 @@
       "op_fast_hadamard_transform_test.cpp"
     ],
     "additional_libs": [
-      "custom_ops"
+      "custom_ops",
+      "dumb_fht"
     ]
   },
   {
@@ -61,6 +62,7 @@
   {
     "directory": "kernels/portable/cpu/util/test",
     "sources": [
+      "broadcast_indexes_range_test.cpp",
       "broadcast_test.cpp",
       "reduce_test.cpp"
     ],
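
Usage sketch for reviewers: the snippet below shows how an elementwise
kernel's inner loop could adopt BroadcastIndexesRange. It is a minimal
illustration, not code introduced by this diff: add_broadcasting is a
hypothetical helper, and the sketch assumes out has already been resized
to the broadcast shape and that all three tensors share the dtype CTYPE.

#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>

// Hypothetical adopter of BroadcastIndexesRange (not part of this
// diff): the inner loop of a broadcasting binary add.
template <typename CTYPE>
void add_broadcasting(
    const executorch::aten::Tensor& a,
    const executorch::aten::Tensor& b,
    executorch::aten::Tensor& out) {
  const CTYPE* const a_data = a.const_data_ptr<CTYPE>();
  const CTYPE* const b_data = b.const_data_ptr<CTYPE>();
  CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
  // Each iteration yields one linear index per tensor; there is no
  // per-element division or modulo, unlike a loop built on
  // delinearize_index() and linearize_access_indexes().
  for (const auto [out_ix, a_ix, b_ix] :
       torch::executor::BroadcastIndexesRange<2>(out, a, b)) {
    out_data[out_ix] = a_data[a_ix] + b_data[b_ix];
  }
}

Since BroadcastIndexesIterator only models a forward iterator, the range
is intended for a single forward sweep like the one above.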