From 3f789b8ed0e367c46eafc0ac44ff44cf444712ef Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Wed, 5 Nov 2025 10:44:58 -0800 Subject: [PATCH 01/28] [Executorch] parallelize op_choose_qparams When doing prefill for quantized kv cache, with large prefill length, parallelizing this op helps. Differential Revision: [D84962234](https://our.internmc.facebook.com/intern/diff/D84962234/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D84962234/)! [ghstack-poisoned] --- kernels/quantized/cpu/op_choose_qparams.cpp | 48 +++++++--- kernels/quantized/cpu/targets.bzl | 1 + .../quantized/test/op_choose_qparams_test.cpp | 95 +++++++++++++++++++ 3 files changed, 133 insertions(+), 11 deletions(-) diff --git a/kernels/quantized/cpu/op_choose_qparams.cpp b/kernels/quantized/cpu/op_choose_qparams.cpp index 5335f4bfbd2..acb8e100af6 100644 --- a/kernels/quantized/cpu/op_choose_qparams.cpp +++ b/kernels/quantized/cpu/op_choose_qparams.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -202,17 +203,42 @@ void choose_qparams_per_token( num_tokens *= input.size(i); } auto token_dim_size = input.size(input.dim() - 1); - for (auto i = 0; i < num_tokens; i++) { - // vec_minf uses std::min_element. Check if it actually - // gets vectorized. - float min = torch::executor::vec_minf(x_fp32, token_dim_size); - float max = torch::executor::vec_maxf(x_fp32, token_dim_size); - double scale; - int32_t zero_point; - calculate_scale_and_zero_point(min, max, qmin, qmax, scale, zero_point); - scale_out.mutable_data_ptr()[i] = scale; - zero_point_out.mutable_data_ptr()[i] = zero_point; - x_fp32 += token_dim_size; + + const int64_t total_elements = num_tokens * token_dim_size; + constexpr int64_t MIN_ELEMENTS_FOR_PARALLEL = 512; + const bool use_parallel = total_elements >= MIN_ELEMENTS_FOR_PARALLEL; + + if (use_parallel) { + auto* scale_data = scale_out.mutable_data_ptr(); + auto* zero_point_data = zero_point_out.mutable_data_ptr(); + + ::executorch::extension::parallel_for( + 0, num_tokens, 1, [&](const int64_t begin, const int64_t end) { + for (int64_t i = begin; i < end; i++) { + const float* token_data = x_fp32 + i * token_dim_size; + float min = torch::executor::vec_minf(token_data, token_dim_size); + float max = torch::executor::vec_maxf(token_data, token_dim_size); + double scale; + int32_t zero_point; + calculate_scale_and_zero_point( + min, max, qmin, qmax, scale, zero_point); + scale_data[i] = scale; + zero_point_data[i] = zero_point; + } + }); + } else { + for (auto i = 0; i < num_tokens; i++) { + // vec_minf uses std::min_element. Check if it actually + // gets vectorized. 
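+    // This sequential fallback mirrors the parallel_for branch above: the
+    // same per-token min/max and scale/zero-point computation, just without
+    // partitioning tokens across threads.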
+ float min = torch::executor::vec_minf(x_fp32, token_dim_size); + float max = torch::executor::vec_maxf(x_fp32, token_dim_size); + double scale; + int32_t zero_point; + calculate_scale_and_zero_point(min, max, qmin, qmax, scale, zero_point); + scale_out.mutable_data_ptr()[i] = scale; + zero_point_out.mutable_data_ptr()[i] = zero_point; + x_fp32 += token_dim_size; + } } } } // namespace diff --git a/kernels/quantized/cpu/targets.bzl b/kernels/quantized/cpu/targets.bzl index f29f1f013b7..1da0d482485 100644 --- a/kernels/quantized/cpu/targets.bzl +++ b/kernels/quantized/cpu/targets.bzl @@ -9,6 +9,7 @@ _QUANT_OPS = ( name = "op_choose_qparams", deps = [ "//executorch/kernels/portable/cpu:vec_ops", + "//executorch/extension/threadpool:threadpool", ], ), op_target( diff --git a/kernels/quantized/test/op_choose_qparams_test.cpp b/kernels/quantized/test/op_choose_qparams_test.cpp index 13426bfdd86..dc92df80488 100644 --- a/kernels/quantized/test/op_choose_qparams_test.cpp +++ b/kernels/quantized/test/op_choose_qparams_test.cpp @@ -15,6 +15,7 @@ #include #include +#include #include using namespace ::testing; @@ -163,3 +164,97 @@ TEST(OpChooseQparamsPerTokenAsymmetricTensorOutTest, DynamicShapeFloat) { EXPECT_TENSOR_CLOSE_WITH_TOL(scale_out, new_expected_scale, 1e-4, 1e-4); EXPECT_TENSOR_EQ(zero_point_out, new_expected_zero_point); } + +TEST( + OpChooseQparamsPerTokenAsymmetricTensorOutTest, + LargeInputParallelization) { + et_pal_init(); + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + // Create input with 8 tokens x 128 elements per token = 1024 total elements + // This exceeds the MIN_ELEMENTS_FOR_PARALLEL threshold of 512 + const int num_tokens = 8; + const int token_size = 128; + std::vector input_data(num_tokens * token_size); + + // Generate test data with known min/max per token for easier verification + std::vector expected_min(num_tokens); + std::vector expected_max(num_tokens); + + for (int i = 0; i < num_tokens; i++) { + float token_min = -1.0f * (i + 1); + float token_max = 2.0f * (i + 1); + expected_min[i] = token_min; + expected_max[i] = token_max; + + for (int j = 0; j < token_size; j++) { + // Linearly interpolate between min and max + float t = j / static_cast(token_size - 1); + input_data[i * token_size + j] = token_min + t * (token_max - token_min); + } + } + + Tensor input = tf_float.make({num_tokens, token_size}, input_data); + Tensor scale_out = tf_double.zeros({num_tokens, 1}); + Tensor zero_point_out = tf_long.zeros({num_tokens, 1}); + + choose_qparams_per_token_asymmetric_out( + input, ScalarType::Float, scale_out, zero_point_out); + + // Manually calculate expected scale and zero_point using the same algorithm + // as calculate_scale_and_zero_point function + const int32_t qmin = -128; + const int32_t qmax = 127; + const float SMALL_SCALE_THRESHOLD = 6.1e-5f; + + for (int i = 0; i < num_tokens; i++) { + float min = std::min(expected_min[i], 0.0f); + float max = std::max(expected_max[i], 0.0f); + + // Calculate scale + double scale = (static_cast(max) - min) / (qmax - qmin); + if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { + scale = 0.1; + } + + // Cut off small scale + if (scale < SMALL_SCALE_THRESHOLD) { + scale = SMALL_SCALE_THRESHOLD; + if (min == 0.0f) { + max = SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else if (max == 0.0f) { + min = -SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else { + float amplifier = SMALL_SCALE_THRESHOLD / scale; + min *= amplifier; + max *= amplifier; + } + } + + // Calculate zero_point + double 
zero_point_from_min = qmin - min / scale;
+    double zero_point_from_max = qmax - max / scale;
+    double zero_point_from_min_error = std::abs(qmin) - std::abs(min / scale);
+    double zero_point_from_max_error = std::abs(qmax) - std::abs(max / scale);
+    double initial_zero_point =
+        zero_point_from_min_error < zero_point_from_max_error
+        ? zero_point_from_min
+        : zero_point_from_max;
+
+    int32_t nudged_zero_point = 0;
+    if (initial_zero_point < qmin) {
+      nudged_zero_point = qmin;
+    } else if (initial_zero_point > qmax) {
+      nudged_zero_point = qmax;
+    } else {
+      nudged_zero_point =
+          std::nearbyint(static_cast<float>(initial_zero_point));
+    }
+
+    // Verify computed values match expected
+    EXPECT_NEAR(scale_out.const_data_ptr<double>()[i], scale, 1e-6);
+    EXPECT_EQ(zero_point_out.const_data_ptr<int64_t>()[i], nudged_zero_point);
+  }
+}

From 08dd9803160c8e2a0f13ba8ec1e6259955a14c62 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Wed, 5 Nov 2025 10:45:02 -0800
Subject: [PATCH 02/28] [Executorch] Add simd path for op quantize

The reason this doesn't directly use the Vectorized class is that the
equivalent APIs don't exist in the Vectorized class.

Differential Revision: [D84962236](https://our.internmc.facebook.com/intern/diff/D84962236/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D84962236/)!

[ghstack-poisoned]
---
 kernels/quantized/cpu/op_quantize.cpp       | 294 +++++++++--
 kernels/quantized/test/op_quantize_test.cpp | 537 +++++++++++++++++++-
 2 files changed, 794 insertions(+), 37 deletions(-)

diff --git a/kernels/quantized/cpu/op_quantize.cpp b/kernels/quantized/cpu/op_quantize.cpp
index 5586f8a77eb..fefb07b1e59 100644
--- a/kernels/quantized/cpu/op_quantize.cpp
+++ b/kernels/quantized/cpu/op_quantize.cpp
@@ -11,6 +11,10 @@
 #include
 #include

+#if defined(__aarch64__) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
 /**
  * For an input tensor, use the scale and zero_point arguments to quantize it.
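 * The affine mapping used here (scalar and NEON paths alike) is
 * q = clamp(round(x / scale) + zero_point, quant_min, quant_max).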
*/ @@ -105,6 +109,143 @@ T quantize_val( return static_cast(qvalue); } +#if defined(__aarch64__) || defined(__ARM_NEON__) + +// Traits for type-specific NEON operations +template +struct NeonQuantizeTraits; + +template <> +struct NeonQuantizeTraits { + // Narrow int16x8 to uint8x8 with saturation (unsigned) + static inline uint8x8_t narrow_and_saturate(int16x8_t v) { + return vqmovun_s16(v); + } + + // Store uint8x8 to memory + static inline void store(uint8_t* ptr, uint8x8_t v) { + vst1_u8(ptr, v); + } + + // Scalar clamping for uint8 + static inline uint8_t clamp_scalar(int32_t val) { + return static_cast(std::min(255, std::max(0, val))); + } +}; + +template <> +struct NeonQuantizeTraits { + // Narrow int16x8 to int8x8 with saturation (signed) + static inline int8x8_t narrow_and_saturate(int16x8_t v) { + return vqmovn_s16(v); + } + + // Store int8x8 to memory + static inline void store(int8_t* ptr, int8x8_t v) { + vst1_s8(ptr, v); + } + + // Scalar clamping for int8 + static inline int8_t clamp_scalar(int32_t val) { + return static_cast(std::min(127, std::max(-128, val))); + } +}; + +// Unified ARM NEON optimized quantization for contiguous blocks +// Processes N elements with a single scale/zero_point pair +// Used for both per-tensor (entire tensor) and per-channel (one block per +// channel) +template +void quantize_arm( + const float* __restrict__ in, + T* __restrict__ out, + const int64_t N, + const float inv_scale, + const int32_t zero_point, + const int32_t quant_min, + const int32_t quant_max) { + using Traits = NeonQuantizeTraits; + const float32x4_t vinv_scale = vdupq_n_f32(inv_scale); + +#if defined(__aarch64__) + // ARMv8: Use vcvtnq_s32_f32 for rounding + const int16x8_t vzero_point = vdupq_n_s16(static_cast(zero_point)); + const int16x8_t vquant_min = vdupq_n_s16(static_cast(quant_min)); + const int16x8_t vquant_max = vdupq_n_s16(static_cast(quant_max)); + + int64_t i = 0; + // Process 8 elements at a time + for (; i + 8 <= N; i += 8) { + const float32x4_t vin0123 = vld1q_f32(in + i); + const float32x4_t vin4567 = vld1q_f32(in + i + 4); + + // Multiply by inv_scale and round + const int32x4_t v0123_rounded = + vcvtnq_s32_f32(vmulq_f32(vin0123, vinv_scale)); + const int32x4_t v4567_rounded = + vcvtnq_s32_f32(vmulq_f32(vin4567, vinv_scale)); + + // Combine to int16 and add zero_point + int16x8_t v01234567_packed = vqaddq_s16( + vqmovn_high_s32(vqmovn_s32(v0123_rounded), v4567_rounded), vzero_point); + + // Clamp to quant_min/quant_max + v01234567_packed = vmaxq_s16(v01234567_packed, vquant_min); + v01234567_packed = vminq_s16(v01234567_packed, vquant_max); + + // Convert to T (int8/uint8) with saturation using type-specific operation + const auto vout01234567 = Traits::narrow_and_saturate(v01234567_packed); + Traits::store(out + i, vout01234567); + } + + // Handle remaining elements with proper quant_min/quant_max clamping + for (; i < N; ++i) { + float val = in[i] * inv_scale; + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::max(quant_min, std::min(quant_max, qval)); + out[i] = static_cast(qval); + } + +#else + // ARMv7: Use magic float rounding + const int32x4_t voffset = vdupq_n_s32(zero_point - 0x4B400000); + const float32x4_t vmagic_float = vdupq_n_f32(12582912.0f); + + int64_t i = 0; + // Process 8 elements at a time + for (; i + 8 <= N; i += 8) { + const float32x4_t vin0123 = vld1q_f32(in + i); + const float32x4_t vin4567 = vld1q_f32(in + i + 4); + + const int32x4_t vraw0123 = vaddq_s32( + voffset, + vreinterpretq_s32_f32( + 
vaddq_f32(vmagic_float, vmulq_f32(vin0123, vinv_scale)))); + const int32x4_t vraw4567 = vaddq_s32( + voffset, + vreinterpretq_s32_f32( + vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale)))); + + const int16x8_t vraw01234567 = + vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567)); + + // Convert to T (int8/uint8) with saturation using type-specific operation + const auto vout01234567 = Traits::narrow_and_saturate(vraw01234567); + Traits::store(out + i, vout01234567); + } + + // Handle remaining elements with proper quant_min/quant_max clamping + for (; i < N; ++i) { + float val = in[i] * inv_scale; + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::max(quant_min, std::min(quant_max, qval)); + out[i] = static_cast(qval); + } +#endif +} + +#endif // defined(__aarch64__) || defined(__ARM_NEON__) + Tensor& quantize_per_tensor_out( const Tensor& input, double scale, @@ -120,19 +261,44 @@ Tensor& quantize_per_tensor_out( check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); - // calculate the quantized input -#define QUANTIZE_IMPL(IN_CTYPE, OUT_CTYPE, out_dtype) \ - case ScalarType::out_dtype: { \ - /* Hoist these function calls out of our inner loop because they might not \ - * get inlined without LTO, particularly in ATen mode. */ \ - auto* out_data_ptr = out.mutable_data_ptr(); \ - const auto* input_data_ptr = input.const_data_ptr(); \ - const auto input_numel = input.numel(); \ - for (size_t i = 0; i < input_numel; i++) { \ - IN_CTYPE value = input_data_ptr[i]; \ - out_data_ptr[i] = quantize_val( \ - scale, zero_point, value, quant_min, quant_max); \ - } \ + // Try ARM NEON optimized path for float->int8/uint8 quantization +#if defined(__aarch64__) || defined(__ARM_NEON__) + if (input.scalar_type() == ScalarType::Float) { + if (dtype == ScalarType::Byte) { + quantize_arm( + input.const_data_ptr(), + out.mutable_data_ptr(), + input.numel(), + 1.0f / static_cast(scale), + static_cast(zero_point), + static_cast(quant_min), + static_cast(quant_max)); + return out; + } else if (dtype == ScalarType::Char) { + quantize_arm( + input.const_data_ptr(), + out.mutable_data_ptr(), + input.numel(), + 1.0f / static_cast(scale), + static_cast(zero_point), + static_cast(quant_min), + static_cast(quant_max)); + return out; + } + } +#endif + + // Fallback scalar implementation for all other cases +#define QUANTIZE_IMPL(IN_CTYPE, OUT_CTYPE, out_dtype) \ + case ScalarType::out_dtype: { \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const auto input_numel = input.numel(); \ + for (size_t i = 0; i < input_numel; i++) { \ + IN_CTYPE value = input_data_ptr[i]; \ + out_data_ptr[i] = quantize_val( \ + scale, zero_point, value, quant_min, quant_max); \ + } \ } break; #define CALCULATE_FLOAT_TYPE(IN_CTYPE, in_dtype) \ case ScalarType::in_dtype: \ @@ -284,29 +450,85 @@ Tensor& quantize_per_channel_out( const double* scale_data = scale.const_data_ptr(); const int64_t* zero_point_data = zero_point.const_data_ptr(); - // High-performance single loop with direct channel calculation -#define QUANTIZE_IMPL(CTYPE_IN, CTYPE_OUT, out_dtype) \ - case ScalarType::out_dtype: { \ - auto* out_data_ptr = out.mutable_data_ptr(); \ - const auto* input_data_ptr = input.const_data_ptr(); \ - const int64_t input_numel = input.numel(); \ - const int64_t axis_size = input.size(axis); \ - /* Calculate the stride pattern for efficient channel index calculation */ \ - int64_t axis_block_size = 1; \ - for (int64_t i = axis + 1; i 
< input.dim(); i++) { \ - axis_block_size *= input.size(i); \ - } \ - /* Single loop over all elements */ \ - for (int64_t i = 0; i < input_numel; i++) { \ - /* Calculate which channel this element belongs to */ \ - int64_t channel_idx = (i / axis_block_size) % axis_size; \ - /* Get quantization parameters for this channel */ \ - double _scale = scale_data[channel_idx]; \ - int64_t _zero_point = zero_point_data[channel_idx]; \ - /* Apply quantization */ \ - out_data_ptr[i] = quantize_val( \ - _scale, _zero_point, input_data_ptr[i], quant_min, quant_max); \ - } \ + // Calculate the block size for each channel + int64_t axis_block_size = 1; + for (int64_t i = axis + 1; i < input.dim(); i++) { + axis_block_size *= input.size(i); + } + const int64_t axis_size = input.size(axis); + + // Try ARM NEON optimized path for float->int8/uint8 quantization +#if defined(__aarch64__) || defined(__ARM_NEON__) + if (input.scalar_type() == ScalarType::Float) { + const int64_t num_blocks = input.numel() / axis_block_size; + + if (dtype == ScalarType::Byte) { + auto* out_data_ptr = out.mutable_data_ptr(); + const auto* input_data_ptr = input.const_data_ptr(); + + // Process each contiguous block (which shares the same scale/zero_point) + for (int64_t block = 0; block < num_blocks; ++block) { + int64_t channel_idx = block % axis_size; + float inv_scale = 1.0f / static_cast(scale_data[channel_idx]); + int32_t zp = static_cast(zero_point_data[channel_idx]); + + const float* in_ptr = input_data_ptr + block * axis_block_size; + uint8_t* out_ptr = out_data_ptr + block * axis_block_size; + + quantize_arm( + in_ptr, + out_ptr, + axis_block_size, + inv_scale, + zp, + static_cast(quant_min), + static_cast(quant_max)); + } + return out; + } else if (dtype == ScalarType::Char) { + auto* out_data_ptr = out.mutable_data_ptr(); + const auto* input_data_ptr = input.const_data_ptr(); + + // Process each contiguous block (which shares the same scale/zero_point) + for (int64_t block = 0; block < num_blocks; ++block) { + int64_t channel_idx = block % axis_size; + float inv_scale = 1.0f / static_cast(scale_data[channel_idx]); + int32_t zp = static_cast(zero_point_data[channel_idx]); + + const float* in_ptr = input_data_ptr + block * axis_block_size; + int8_t* out_ptr = out_data_ptr + block * axis_block_size; + + quantize_arm( + in_ptr, + out_ptr, + axis_block_size, + inv_scale, + zp, + static_cast(quant_min), + static_cast(quant_max)); + } + return out; + } + } +#endif + + // Fallback scalar implementation +#define QUANTIZE_IMPL(CTYPE_IN, CTYPE_OUT, out_dtype) \ + case ScalarType::out_dtype: { \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const int64_t input_numel = input.numel(); \ + /* Single loop over all elements */ \ + for (int64_t i = 0; i < input_numel; i++) { \ + /* Calculate which channel this element belongs to */ \ + int64_t channel_idx = (i / axis_block_size) % axis_size; \ + /* Get quantization parameters for this channel */ \ + double _scale = scale_data[channel_idx]; \ + int64_t _zero_point = zero_point_data[channel_idx]; \ + /* Apply quantization */ \ + out_data_ptr[i] = quantize_val( \ + _scale, _zero_point, input_data_ptr[i], quant_min, quant_max); \ + } \ } break; #define CALCULATE_FLOAT_TYPE(CTYPE_IN, in_dtype) \ diff --git a/kernels/quantized/test/op_quantize_test.cpp b/kernels/quantized/test/op_quantize_test.cpp index 4ac835c24ce..b450ec0ee33 100644 --- a/kernels/quantized/test/op_quantize_test.cpp +++ 
b/kernels/quantized/test/op_quantize_test.cpp @@ -14,7 +14,6 @@ #include #include -#include using namespace ::testing; using executorch::aten::ArrayRef; @@ -446,3 +445,539 @@ TEST(OpQuantizeOutTest, QuantizePerChannelClampingBehavior) { EXPECT_TENSOR_EQ(out, expected); } + +TEST(OpQuantizeOutTest, LargePerChannelClampingSIMDPath) { + // Test quant_min/quant_max clamping with large tensor to exercise SIMD path + // Shape: [3, 80] with axis=0 (3 channels, 80 elements each) + // 80 elements = 10 SIMD iterations (8 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 3; + const int block_size = 80; + std::vector input_data(num_channels * block_size); + + // Create input data with values that exceed quant_min/quant_max + for (int ch = 0; ch < num_channels; ch++) { + for (int i = 0; i < block_size; i++) { + // Generate values from -150 to 150 to test clamping + input_data[ch * block_size + i] = + static_cast((i % 40) - 20) * 5.0f * (ch + 1); + } + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + // Use uniform scale and zero_point for all channels + Tensor scale = tf_double.make({num_channels}, {1.0, 1.0, 1.0}); + Tensor zero_point = tf_long.make({num_channels}, {0, 0, 0}); + + // Set narrow quant_min/quant_max to force clamping + int64_t quant_min = -20; + int64_t quant_max = 20; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values with clamping + std::vector expected_data(num_channels * block_size); + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + // Use double precision to avoid overflow + double val = static_cast(input_data[idx]) / ch_scale; + // Clamp before converting to int to avoid overflow + val = std::max(-1000.0, std::min(1000.0, val)); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + // Apply quant_min/quant_max clamping + qval = std::max( + static_cast(quant_min), + std::min(static_cast(quant_max), qval)); + expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +// Large tensor tests to ensure ARM NEON SIMD path is exercised + +TEST(OpQuantizeOutTest, LargeTensorUInt8SIMDPath) { + // Test with 64 elements to fully exercise SIMD path (8 elements per + // iteration) + TensorFactory tf_float; + + // Create input with known values for verification + std::vector input_data(64); + for (size_t i = 0; i < 64; i++) { + input_data[i] = static_cast(i) * 0.5f; // 0.0, 0.5, 1.0, 1.5, ... 
+ } + Tensor input = tf_float.make({64}, input_data); + + double scale = 0.1; + int64_t zero_point = 10; + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({64}); + + // Compute expected values: round(value / scale) + zero_point + std::vector expected_data(64); + for (size_t i = 0; i < 64; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({64}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargeTensorInt8SIMDPath) { + // Test with 72 elements (9 SIMD iterations of 8) to test both vectorized and + // scalar paths + TensorFactory tf_float; + + std::vector input_data(72); + for (size_t i = 0; i < 72; i++) { + // Mix of positive and negative values + input_data[i] = static_cast(static_cast(i) - 36) * 0.25f; + } + Tensor input = tf_float.make({72}, input_data); + + double scale = 0.2; + int64_t zero_point = 0; + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({72}); + + // Compute expected values + std::vector expected_data(72); + for (size_t i = 0; i < 72; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(127, std::max(-128, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({72}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargeTensorWithRemainderUInt8) { + // Test with 100 elements (12 SIMD iterations + 4 remainder) to test remainder + // handling + TensorFactory tf_float; + + std::vector input_data(100); + for (size_t i = 0; i < 100; i++) { + input_data[i] = static_cast(i % 50) * 0.3f; + } + Tensor input = tf_float.make({100}, input_data); + + double scale = 0.15; + int64_t zero_point = 128; + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({100}); + + std::vector expected_data(100); + for (size_t i = 0; i < 100; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({100}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargeTensorWithRemainderInt8) { + // Test with 99 elements (12 SIMD iterations + 3 remainder) + TensorFactory tf_float; + + std::vector input_data(99); + for (size_t i = 0; i < 99; i++) { + input_data[i] = std::sin(static_cast(i) * 0.1f) * 10.0f; + } + Tensor input = tf_float.make({99}, input_data); + + double scale = 0.1; + int64_t zero_point = 5; + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({99}); + + std::vector expected_data(99); + for (size_t i = 0; i < 99; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(127, std::max(-128, qval)); + expected_data[i] = static_cast(qval); + } + Tensor 
expected = tfo.make({99}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, VeryLargeTensor2DUInt8) { + // Test with realistic 2D tensor size that would be used in neural networks + // 256x256 = 65536 elements (8192 SIMD iterations) + TensorFactory tf_float; + + std::vector input_data(256 * 256); + for (size_t i = 0; i < 256 * 256; i++) { + // Generate diverse values in a safe range + input_data[i] = + static_cast((static_cast(i % 256) - 128)) * 0.05f; + } + Tensor input = tf_float.make({256, 256}, input_data); + + double scale = 0.05; + int64_t zero_point = 128; + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({256, 256}); + + // Compute expected values with proper overflow handling + std::vector expected_data(256 * 256); + for (size_t i = 0; i < 256 * 256; i++) { + // Use double precision to avoid overflow + double val = static_cast(input_data[i]) / scale; + // Clamp before converting to int to avoid overflow + val = std::max(-1000.0, std::min(1000.0, val)); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(zero_point); + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({256, 256}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, VeryLargeTensor3DInt8) { + // Test with 3D tensor (batch_size=2, height=64, width=128) = 16384 elements + TensorFactory tf_float; + + const size_t total_elements = 2 * 64 * 128; + std::vector input_data(total_elements); + for (size_t i = 0; i < total_elements; i++) { + input_data[i] = std::cos(static_cast(i) * 0.01f) * 8.0f; + } + Tensor input = tf_float.make({2, 64, 128}, input_data); + + double scale = 0.0625; // 1/16 + int64_t zero_point = -10; + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({2, 64, 128}); + + std::vector expected_data(total_elements); + for (size_t i = 0; i < total_elements; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(127, std::max(-128, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({2, 64, 128}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, EdgeCaseSizesSIMD) { + // Test specific sizes around SIMD boundaries + TensorFactory tf_float; + TensorFactory tfo; + + double scale = 0.1; + int64_t zero_point = 100; + int64_t quant_min = 0; + int64_t quant_max = 255; + + // Test sizes: 7 (just before SIMD), 8 (exactly 1 SIMD), 9 (1 SIMD + 1), 15, + // 16, 17 + std::vector test_sizes = { + 7, 8, 9, 15, 16, 17, 23, 24, 25, 31, 32, 33}; + + for (size_t size : test_sizes) { + std::vector input_data(size); + std::vector expected_data(size); + + for (size_t i = 0; i < size; i++) { + input_data[i] = static_cast(i) * 0.3f; + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + + Tensor input = tf_float.make({static_cast(size)}, input_data); + Tensor out = tfo.zeros({static_cast(size)}); + Tensor 
expected = tfo.make({static_cast(size)}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); + } +} + +// Large tensor tests for per-channel quantization to ensure SIMD path is +// exercised + +TEST(OpQuantizeOutTest, LargePerChannelUInt8SIMDPath) { + // Test per-channel quantization with large blocks (64 elements per channel) + // Shape: [4, 64] with axis=1 (4 channels, 64 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 4; + const int block_size = 64; + std::vector input_data(num_channels * block_size); + + // Create varying input data for each channel + for (int ch = 0; ch < num_channels; ch++) { + for (int i = 0; i < block_size; i++) { + input_data[ch * block_size + i] = static_cast((ch + 1) * i) * 0.1f; + } + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + // Different scale and zero_point for each channel + Tensor scale = tf_double.make({num_channels}, {0.1, 0.2, 0.15, 0.25}); + Tensor zero_point = tf_long.make({num_channels}, {10, 20, 15, 25}); + + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values + std::vector expected_data(num_channels * block_size); + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + float val = input_data[idx] / static_cast(ch_scale); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + qval = std::min(255, std::max(0, qval)); + expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargePerChannelInt8SIMDPath) { + // Test per-channel quantization with int8 and large blocks + // Shape: [3, 100] with axis=1 (3 channels, 100 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 3; + const int block_size = 100; // 12 SIMD iterations + 4 remainder + std::vector input_data(num_channels * block_size); + + // Create varying input data with negative values + for (int ch = 0; ch < num_channels; ch++) { + for (int i = 0; i < block_size; i++) { + input_data[ch * block_size + i] = + static_cast(i - 50) * 0.2f * (ch + 1); + } + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + Tensor scale = tf_double.make({num_channels}, {0.1, 0.15, 0.2}); + Tensor zero_point = tf_long.make({num_channels}, {0, -5, 5}); + + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values + std::vector expected_data(num_channels * block_size); + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + float val = input_data[idx] / static_cast(ch_scale); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + qval = std::min(127, std::max(-128, qval)); + 
expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, VeryLargePerChannel2DUInt8) { + // Test realistic neural network weight tensor + // Shape: [128, 256] with axis=0 (128 channels, 256 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 128; + const int block_size = 256; + const int total_elements = num_channels * block_size; + + std::vector input_data(total_elements); + for (int i = 0; i < total_elements; i++) { + input_data[i] = std::sin(static_cast(i) * 0.01f) * 5.0f; + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + // Create varying scales and zero_points for each channel + std::vector scales(num_channels); + std::vector zero_points(num_channels); + for (int ch = 0; ch < num_channels; ch++) { + scales[ch] = 0.02 + (ch % 10) * 0.001; // Varying scales + zero_points[ch] = 128 + (ch % 5); // Varying zero_points + } + Tensor scale = tf_double.make({num_channels}, scales); + Tensor zero_point = tf_long.make({num_channels}, zero_points); + + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values + std::vector expected_data(total_elements); + for (int ch = 0; ch < num_channels; ch++) { + float inv_scale = 1.0f / static_cast(scales[ch]); + int64_t ch_zero_point = zero_points[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + float val = input_data[idx] * inv_scale; + // Clamp before converting to avoid overflow + val = std::max(-1000.0f, std::min(1000.0f, val)); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + + qval = std::min(255, std::max(0, qval)); + expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, PerChannelAxis1LargeBlocks) { + // Test per-channel quantization with axis=1 and large contiguous blocks + // Shape: [2, 3, 64] with axis=1 (2 batches, 3 channels, 64 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int batch_size = 2; + const int num_channels = 3; + const int block_size = 64; + const int total_elements = batch_size * num_channels * block_size; + + std::vector input_data(total_elements); + for (int i = 0; i < total_elements; i++) { + input_data[i] = static_cast(i % 100) * 0.1f; + } + Tensor input = + tf_float.make({batch_size, num_channels, block_size}, input_data); + + Tensor scale = tf_double.make({num_channels}, {0.05, 0.1, 0.15}); + Tensor zero_point = tf_long.make({num_channels}, {100, 110, 120}); + + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({batch_size, num_channels, block_size}); + + // Compute expected values + std::vector expected_data(total_elements); + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = (b * num_channels + 
ch) * block_size + i; + float val = input_data[idx] / static_cast(ch_scale); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + qval = std::min(255, std::max(0, qval)); + expected_data[idx] = static_cast(qval); + } + } + } + Tensor expected = + tfo.make({batch_size, num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 1, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} From 27fc8b1fb673dcba42b22f3a3ee79f4fd2da9343 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Wed, 5 Nov 2025 10:45:07 -0800 Subject: [PATCH 03/28] [Executorch] Add multithreading for op_quantize As the title Differential Revision: [D84962233](https://our.internmc.facebook.com/intern/diff/D84962233/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D84962233/)! [ghstack-poisoned] --- kernels/quantized/cpu/op_quantize.cpp | 122 +++++++++++++++++++------- kernels/quantized/cpu/targets.bzl | 3 + 2 files changed, 91 insertions(+), 34 deletions(-) diff --git a/kernels/quantized/cpu/op_quantize.cpp b/kernels/quantized/cpu/op_quantize.cpp index fefb07b1e59..e52b9a371e6 100644 --- a/kernels/quantized/cpu/op_quantize.cpp +++ b/kernels/quantized/cpu/op_quantize.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -461,51 +462,104 @@ Tensor& quantize_per_channel_out( #if defined(__aarch64__) || defined(__ARM_NEON__) if (input.scalar_type() == ScalarType::Float) { const int64_t num_blocks = input.numel() / axis_block_size; + const int64_t total_elements = input.numel(); + constexpr int64_t MIN_ELEMENTS_FOR_PARALLEL = 512; + const bool use_parallel = (total_elements >= MIN_ELEMENTS_FOR_PARALLEL); if (dtype == ScalarType::Byte) { auto* out_data_ptr = out.mutable_data_ptr(); const auto* input_data_ptr = input.const_data_ptr(); - // Process each contiguous block (which shares the same scale/zero_point) - for (int64_t block = 0; block < num_blocks; ++block) { - int64_t channel_idx = block % axis_size; - float inv_scale = 1.0f / static_cast(scale_data[channel_idx]); - int32_t zp = static_cast(zero_point_data[channel_idx]); - - const float* in_ptr = input_data_ptr + block * axis_block_size; - uint8_t* out_ptr = out_data_ptr + block * axis_block_size; - - quantize_arm( - in_ptr, - out_ptr, - axis_block_size, - inv_scale, - zp, - static_cast(quant_min), - static_cast(quant_max)); + if (use_parallel) { + ::executorch::extension::parallel_for( + 0, num_blocks, 1, [&](const int64_t begin, const int64_t end) { + for (int64_t block = begin; block < end; ++block) { + int64_t channel_idx = block % axis_size; + float inv_scale = + 1.0f / static_cast(scale_data[channel_idx]); + int32_t zp = static_cast(zero_point_data[channel_idx]); + + const float* in_ptr = input_data_ptr + block * axis_block_size; + uint8_t* out_ptr = out_data_ptr + block * axis_block_size; + + quantize_arm( + in_ptr, + out_ptr, + axis_block_size, + inv_scale, + zp, + static_cast(quant_min), + static_cast(quant_max)); + } + }); + } else { + // Process each contiguous block (which shares the same + // scale/zero_point) + for (int64_t block = 0; block < num_blocks; ++block) { + int64_t channel_idx = block % axis_size; + float inv_scale = 1.0f / static_cast(scale_data[channel_idx]); + int32_t zp = static_cast(zero_point_data[channel_idx]); + + const float* in_ptr = input_data_ptr + block * axis_block_size; + uint8_t* out_ptr = out_data_ptr + 
block * axis_block_size;
+
+          quantize_arm<uint8_t>(
+              in_ptr,
+              out_ptr,
+              axis_block_size,
+              inv_scale,
+              zp,
+              static_cast<int32_t>(quant_min),
+              static_cast<int32_t>(quant_max));
+        }
+      }
       return out;
     } else if (dtype == ScalarType::Char) {
       auto* out_data_ptr = out.mutable_data_ptr<int8_t>();
       const auto* input_data_ptr = input.const_data_ptr<float>();

-      // Process each contiguous block (which shares the same scale/zero_point)
-      for (int64_t block = 0; block < num_blocks; ++block) {
-        int64_t channel_idx = block % axis_size;
-        float inv_scale = 1.0f / static_cast<float>(scale_data[channel_idx]);
-        int32_t zp = static_cast<int32_t>(zero_point_data[channel_idx]);
-
-        const float* in_ptr = input_data_ptr + block * axis_block_size;
-        int8_t* out_ptr = out_data_ptr + block * axis_block_size;
-
-        quantize_arm<int8_t>(
-            in_ptr,
-            out_ptr,
-            axis_block_size,
-            inv_scale,
-            zp,
-            static_cast<int32_t>(quant_min),
-            static_cast<int32_t>(quant_max));
+      if (use_parallel) {
+        ::executorch::extension::parallel_for(
+            0, num_blocks, 1, [&](const int64_t begin, const int64_t end) {
+              for (int64_t block = begin; block < end; ++block) {
+                int64_t channel_idx = block % axis_size;
+                float inv_scale =
+                    1.0f / static_cast<float>(scale_data[channel_idx]);
+                int32_t zp = static_cast<int32_t>(zero_point_data[channel_idx]);
+
+                const float* in_ptr = input_data_ptr + block * axis_block_size;
+                int8_t* out_ptr = out_data_ptr + block * axis_block_size;
+
+                quantize_arm<int8_t>(
+                    in_ptr,
+                    out_ptr,
+                    axis_block_size,
+                    inv_scale,
+                    zp,
+                    static_cast<int32_t>(quant_min),
+                    static_cast<int32_t>(quant_max));
+              }
+            });
+      } else {
+        // Process each contiguous block (which shares the same
+        // scale/zero_point)
+        for (int64_t block = 0; block < num_blocks; ++block) {
+          int64_t channel_idx = block % axis_size;
+          float inv_scale = 1.0f / static_cast<float>(scale_data[channel_idx]);
+          int32_t zp = static_cast<int32_t>(zero_point_data[channel_idx]);
+
+          const float* in_ptr = input_data_ptr + block * axis_block_size;
+          int8_t* out_ptr = out_data_ptr + block * axis_block_size;
+
+          quantize_arm<int8_t>(
+              in_ptr,
+              out_ptr,
+              axis_block_size,
+              inv_scale,
+              zp,
+              static_cast<int32_t>(quant_min),
+              static_cast<int32_t>(quant_max));
+        }
+      }
       return out;
     }

diff --git a/kernels/quantized/cpu/targets.bzl b/kernels/quantized/cpu/targets.bzl
index 1da0d482485..88a3823c5f3 100644
--- a/kernels/quantized/cpu/targets.bzl
+++ b/kernels/quantized/cpu/targets.bzl
@@ -52,6 +52,9 @@ _QUANT_OPS = (
     ),
     op_target(
         name = "op_quantize",
+        deps = [
+            "//executorch/extension/threadpool:threadpool",
+        ],
     ),
 )

From ae61ab48e405a957a6b8b164c1a629d403935271 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Wed, 5 Nov 2025 10:45:11 -0800
Subject: [PATCH 04/28] Reduce allocation overhead in quantized sdpa

For small models, dequantizing portions of the v cache causes extra
allocation overhead. A better way to handle this would probably be to
dequantize the entire v cache outside the model.

There isn't a significant perf advantage from this yet, but subsequent
diffs will use a caching allocator, which this refactor helps enable.
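The gist of the change, as a minimal standalone sketch (the names, the loop
structure, and dequant_into/gemm_from are illustrative stand-ins, not the
actual sdpa code): allocate one scratch slab up front and give each thread a
fixed slice, so no std::vector is constructed per dequantize call:

    #include <cstdint>
    #include <vector>

    void run_all_blocks(int num_threads, int64_t per_thread_elems) {
      // One allocation for the whole run instead of one per call.
      std::vector<float> scratch(num_threads * per_thread_elems);
      for (int t = 0; t < num_threads; ++t) {
        // Each thread dequantizes into its own slice, then the gemm
        // reads from that slice:
        float* buf = scratch.data() + t * per_thread_elems;
        // dequant_into(buf); gemm_from(buf);
        (void)buf;
      }
    }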
Differential Revision: [D85532077](https://our.internmc.facebook.com/intern/diff/D85532077/)

[ghstack-poisoned]
---
 extension/llm/custom_ops/op_sdpa_impl.h | 27 ++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h
index e0a81c4650c..07ce16dd048 100644
--- a/extension/llm/custom_ops/op_sdpa_impl.h
+++ b/extension/llm/custom_ops/op_sdpa_impl.h
@@ -213,13 +213,13 @@ void dequant_and_gemm(
     const int64_t v_stride_n,
     float* o_data,
     const int64_t o_stride_m,
-    const float beta) {
-  std::vector<float> dequantized_v_data(v_data.m * v_data.n);
+    const float beta,
+    float* buf_qdq_ptr) {
   dequantize_per_channel_optimized(
       static_cast<const int8_t*>(v_data.data),
       static_cast<const float*>(v_data.scales),
       static_cast<const int8_t*>(v_data.zero_points),
-      dequantized_v_data.data(),
+      buf_qdq_ptr,
       -128,
       127,
       1,
@@ -237,7 +237,7 @@
       m,
       k,
       static_cast<float>(1),
-      dequantized_v_data.data(),
+      buf_qdq_ptr,
       v_data.n,
       qk_data,
       qk_stride_m,
@@ -257,7 +257,8 @@ void _qk_at_v_gemm(
     const int64_t v_stride_n,
     accum_t* o_data,
     const int64_t o_stride_m,
-    const accum_t beta) {
+    const accum_t beta,
+    accum_t* buf_qdq_ptr) {
   if (v_data.dtype == ScalarType::Char) {
     if constexpr (std::is_same<accum_t, float>::value) {
       if (m > 4) {
@@ -273,7 +274,8 @@
           v_stride_n,
           o_data,
           o_stride_m,
-          beta);
+          beta,
+          buf_qdq_ptr);
       } else {
         // For smaller batch sizes, use quantized gemm
         int a_stride_m_tmp, b_stride_n_tmp;
@@ -773,6 +775,15 @@ void cpu_flash_attention(
   // at::Tensor buf_reduced = at::empty(
   //    {num_thread, qSplitSize, is_reduced_type ? kvSplitSize : 0},
   //    query.options());
+  int64_t size_per_thread_qdq_vec = qSplitSize * kvSplitSize * headSize;
+  // Align size_per_thread_qdq_vec to a multiple of kAlignment elements, for
+  // coalesced cache reads, by padding with the right number of per-thread
+  // elements.
+  constexpr int64_t kAlignment = 32;
+  size_per_thread_qdq_vec = (size_per_thread_qdq_vec + kAlignment - 1) & ~(kAlignment - 1);
+  int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * query.element_size();
+  int64_t size_qdq_bytes = size_per_thread_qdq_bytes * num_thread;
+  std::vector<char> scratch_for_quant_dequant_vec(size_qdq_bytes);
+  accum_t* scratch_for_quant_dequant = reinterpret_cast<accum_t*>(scratch_for_quant_dequant_vec.data());

   // Data ptrs
   const scalar_t* q_data = query.const_data_ptr<scalar_t>();
@@ -797,6 +808,7 @@
     scalar_t* qk_reduced_data = is_reduced_type
         ? buf_reduced_data + ompIdx * qSplitSize * kvSplitSize
         : nullptr;
+    accum_t* buf_qdq_ptr = scratch_for_quant_dequant + ompIdx * size_per_thread_qdq_vec;

     for (int64_t z = begin; z < end; z++) {
       int64_t m = k * qSplitSize;
@@ -1053,7 +1065,8 @@
             vStrideN,
             dst_data,
             headSize,
-            n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1));
+            n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1),
+            buf_qdq_ptr);
       }
       // dst <- dst / sum[row]
       // reorder MHA output with strides

From ea16e152f7e562f51ceb96bb7ab3a68217c97446 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Wed, 5 Nov 2025 10:45:15 -0800
Subject: [PATCH 05/28] [Executorch] Introduce caching cpu memory allocator

Meant to use this as the temp allocator for kernels.
Specifically for sdpa, it seems that on iOS there is a significant
overhead coming from allocations.

Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/)

[ghstack-poisoned]
---
 extension/memory_allocator/CMakeLists.txt     |  48 +++
 .../cpu_caching_malloc_allocator.cpp          |  88 +++++
 .../cpu_caching_malloc_allocator.h            |  81 +++++
 extension/memory_allocator/targets.bzl        |  17 +
 .../cpu_caching_malloc_allocator_test.cpp     | 303 ++++++++++++++++++
 extension/memory_allocator/test/targets.bzl   |  10 +
 .../executorch/build/build_variables.bzl      |   4 +
 tools/cmake/Codegen.cmake                     |   2 +
 8 files changed, 553 insertions(+)
 create mode 100644 extension/memory_allocator/CMakeLists.txt
 create mode 100644 extension/memory_allocator/cpu_caching_malloc_allocator.cpp
 create mode 100644 extension/memory_allocator/cpu_caching_malloc_allocator.h
 create mode 100644 extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp

diff --git a/extension/memory_allocator/CMakeLists.txt b/extension/memory_allocator/CMakeLists.txt
new file mode 100644
index 00000000000..1c3c8a0831c
--- /dev/null
+++ b/extension/memory_allocator/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+list(TRANSFORM _extension_memory_allocator__srcs PREPEND "${EXECUTORCH_ROOT}/")
+if(CMAKE_TOOLCHAIN_IOS
+   OR CMAKE_TOOLCHAIN_ANDROID
+   OR APPLE
+)
+  # Building a shared library on iOS requires code signing. On Android we
+  # see duplicated registration when using a shared lib.
+  add_library(extension_memory_allocator STATIC ${_extension_memory_allocator__srcs})
+else()
+  add_library(extension_memory_allocator ${_extension_memory_allocator__srcs})
+endif()
+target_link_libraries(
+  extension_memory_allocator PRIVATE executorch_core)
+target_include_directories(
+  extension_memory_allocator PUBLIC ${_common_include_directories}
+)
+target_compile_options(
+  extension_memory_allocator
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
+
+# Install libraries
+install(
+  TARGETS extension_memory_allocator
+  EXPORT ExecuTorchTargets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
new file mode 100644
index 00000000000..11ecc5b326e
--- /dev/null
+++ b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
@@ -0,0 +1,88 @@
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
+
+#include <algorithm>
+#include <cstdlib>
+
+namespace executorch::extension {
+
+namespace {
+size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
+  alignment = std::max(alignment, kDefaultAlignment);
+  if (size % alignment != 0) {
+    // Adjust size to the next multiple of alignment.
+    // This is needed for aligned_alloc to work.
+    return (size + alignment) & ~(alignment - 1);
+  } else {
+    return size;
+  }
+}
+} // namespace
+
+CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
+    : MemoryAllocator(0, nullptr) {
+  max_size_ = max_size;
+  current_size_ = 0;
+}
+
+void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
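+  // In brief: the request is rounded up to a multiple of the (power-of-2)
+  // alignment, then served from the size-keyed cache when possible; a miss
+  // falls back to std::aligned_alloc, evicting the whole cache first if
+  // the running total would exceed max_size_.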
+  EXECUTORCH_TRACK_ALLOCATION(prof_id(), size);
+
+  if (!isPowerOf2(alignment)) {
+    ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
+    return nullptr;
+  }
+  size = get_alignment_adjusted_size(size, alignment);
+
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = available_map_.find(size);
+  if (it == available_map_.end() || it->second.empty()) {
+    if (current_size_ + size > max_size_) {
+      // Freeing while holding the lock will cause performance issues.
+      // We probably should log how often this happens, so the calling
+      // site can adjust the max_size_ parameter.
+      free_cached();
+    }
+    void* ptr = std::aligned_alloc(alignment, size);
+    if (ptr == nullptr) {
+      ET_LOG(Error, "Failed to allocate memory");
+      return nullptr;
+    }
+    current_size_ += size;
+    allocation_map_[ptr] = size;
+    return ptr;
+  }
+  void* ptr = it->second.back();
+  it->second.pop_back();
+  // Count the reused block as live again so the decrement in reset()
+  // stays balanced.
+  current_size_ += size;
+  allocation_map_[ptr] = size;
+  return ptr;
+}
+
+void CPUCachingAllocator::free_cached() {
+  // We don't lock mutex_ here because doing so would deadlock. We could
+  // use a recursive_mutex, but instead we rely on free_cached() not being
+  // a public API.
+  for (const auto& it : available_map_) {
+    for (const auto ptr : it.second) {
+      std::free(ptr);
+    }
+  }
+  available_map_.clear();
+}
+
+void CPUCachingAllocator::reset() {
+  std::lock_guard<std::mutex> guard(mutex_);
+  for (auto& it : allocation_map_) {
+    void* ptr = it.first;
+    size_t alloc_size = it.second;
+    // Cache the memory
+    available_map_[alloc_size].push_back(ptr);
+    current_size_ -= alloc_size;
+  }
+  allocation_map_.clear();
+}
+
+CPUCachingAllocator::~CPUCachingAllocator() {
+  // The destructor must be called in a thread-safe manner.
+  reset();
+  free_cached();
+}
+
+} // namespace executorch::extension
diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.h b/extension/memory_allocator/cpu_caching_malloc_allocator.h
new file mode 100644
index 00000000000..d5818020a05
--- /dev/null
+++ b/extension/memory_allocator/cpu_caching_malloc_allocator.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <cstdint>
+#include <mutex>
+
+#include <executorch/runtime/core/memory_allocator.h>
+
+#ifdef USE_C10_SMALL_VECTOR
+#include <c10/util/SmallVector.h>
+#else
+#include <vector>
+#endif
+
+#ifdef USE_C10_FLAT_HASH_MAP
+#include <c10/util/flat_hash_map.h>
+#else
+#include <unordered_map>
+#endif
+
+/*
+ * CPUCachingAllocator:
+ * This file is copied over from c10/mobile/CPUCachingAllocator.h
+ * It is a thread safe caching allocator.
+ */
+
+namespace executorch::extension {
+
+#ifdef USE_C10_SMALL_VECTOR
+template <typename T>
+using SmallVector = c10::SmallVector<T, 16>;
+#else
+template <typename T>
+using SmallVector = std::vector<T>;
+#endif
+
+#ifdef USE_C10_FLAT_HASH_MAP
+template <typename K, typename V>
+using FlatHashMap = ska::flat_hash_map<K, V>;
+#else
+template <typename K, typename V>
+using FlatHashMap = std::unordered_map<K, V>;
+#endif
+
+constexpr size_t kDefaultAlignment = 64;
+class CPUCachingAllocator : public executorch::runtime::MemoryAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * The cache key is the size of the allocation.
+   * If the requested size is found in the cache, the cached pointer is
+   * returned.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  void free_cached();
+
+ protected:
+  // Invariants.
+  // Any new invariants must be documented here.
+  FlatHashMap<size_t, SmallVector<void*>> available_map_;
+  FlatHashMap<void*, size_t> allocation_map_;
+  // Since allocation_map_ is mutated/read via all public APIs, we need a
+  // mutex.
+  std::mutex mutex_;
+  size_t max_size_;
+  size_t current_size_;
+
+ public:
+  /*
+   max_size: Maximum size of memory to cache. Never cache more than that.
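+
+   Illustrative usage (sizes here are arbitrary, mirroring the unit tests):
+     CPUCachingAllocator allocator(1024 * 1024);
+     void* p = allocator.allocate(256);  // freshly allocated and recorded
+     allocator.reset();                  // p is moved into the cache
+     void* q = allocator.allocate(256);  // served from the cache; q == p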
+ */ + CPUCachingAllocator(uint32_t max_size); + // Checks the cache to see if allocation of size bytes can be found. + // If so return cached memory, else + // allocates memory, records it for caching and returns. + void* allocate(size_t size, size_t alignment = kDefaultAlignment) override; + void reset() override; + ~CPUCachingAllocator(); +}; + +} // namespace executorch::extension diff --git a/extension/memory_allocator/targets.bzl b/extension/memory_allocator/targets.bzl index d021a4da707..f51b084e4ee 100644 --- a/extension/memory_allocator/targets.bzl +++ b/extension/memory_allocator/targets.bzl @@ -20,3 +20,20 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) + + runtime.cxx_library( + name = "cpu_caching_allocator", + srcs = [ + "cpu_caching_malloc_allocator.cpp", + ], + exported_headers = [ + "cpu_caching_malloc_allocator.h", + ], + exported_deps = [ + "//executorch/runtime/core:memory_allocator", + ], + visibility = [ + "//executorch/extension/memory_allocator/test/...", + "@EXECUTORCH_CLIENTS", + ], + ) diff --git a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp new file mode 100644 index 00000000000..94929e127e9 --- /dev/null +++ b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp @@ -0,0 +1,303 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace ::testing; +using executorch::extension::CPUCachingAllocator; + +constexpr auto kDefaultAlignment = executorch::extension::kDefaultAlignment; + +class CPUCachingAllocatorTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. 
+ executorch::runtime::runtime_init(); + } +}; + +bool is_aligned(const void* ptr, size_t alignment) { + uintptr_t addr = reinterpret_cast(ptr); + return addr % alignment == 0; +} + +#define EXPECT_ALIGNED(ptr, alignment) \ + EXPECT_TRUE(is_aligned((ptr), (alignment))) \ + << "Pointer " << (ptr) << " is not aligned to " << (alignment) + +TEST_F(CPUCachingAllocatorTest, SimpleAllocateSucceeds) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + auto p = allocator.allocate(16); + EXPECT_NE(p, nullptr); + EXPECT_ALIGNED(p, kDefaultAlignment); + + auto p2 = allocator.allocate(32); + EXPECT_NE(p2, nullptr); + EXPECT_ALIGNED(p2, kDefaultAlignment); + + auto p3 = allocator.allocate(64); + EXPECT_NE(p3, nullptr); + EXPECT_ALIGNED(p3, kDefaultAlignment); +} + +TEST_F(CPUCachingAllocatorTest, CachingReusesSameSize) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + auto p1 = allocator.allocate(256); + EXPECT_NE(p1, nullptr); + EXPECT_ALIGNED(p1, kDefaultAlignment); + + // Reset to return the allocation to the cache + allocator.reset(); + + // Allocate the same size should reuse the cached pointer + auto p2 = allocator.allocate(256); + EXPECT_EQ(p1, p2); + EXPECT_ALIGNED(p2, kDefaultAlignment); +} + +TEST_F(CPUCachingAllocatorTest, DifferentSizesAllocateDifferentPtrs) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + auto p1 = allocator.allocate(128); + auto p2 = allocator.allocate(256); + auto p3 = allocator.allocate(512); + + EXPECT_NE(p1, nullptr); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p3, nullptr); + + // All pointers should be different + EXPECT_NE(p1, p2); + EXPECT_NE(p2, p3); + EXPECT_NE(p1, p3); + + EXPECT_ALIGNED(p1, kDefaultAlignment); + EXPECT_ALIGNED(p2, kDefaultAlignment); + EXPECT_ALIGNED(p3, kDefaultAlignment); +} + +TEST_F(CPUCachingAllocatorTest, ResetCachesAllocations) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + auto p1 = allocator.allocate(256); + auto p2 = allocator.allocate(256); + EXPECT_NE(p1, p2); + + allocator.reset(); + + // After reset, both cached allocations should be available + auto p3 = allocator.allocate(256); + auto p4 = allocator.allocate(256); + + // p3 should be one of the cached pointers (either p1 or p2) + EXPECT_TRUE((p3 == p1) || (p3 == p2)); + EXPECT_TRUE((p4 == p1) || (p4 == p2)); + EXPECT_NE(p3, p4); +} + +TEST_F(CPUCachingAllocatorTest, AlignmentParameter) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + std::vector alignments = { + kDefaultAlignment, + kDefaultAlignment * 2, + kDefaultAlignment * 4, + kDefaultAlignment * 8, + }; + + for (size_t alignment : alignments) { + auto p = allocator.allocate(256, alignment); + EXPECT_NE(p, nullptr); + EXPECT_ALIGNED(p, alignment); + } +} + +TEST_F(CPUCachingAllocatorTest, InvalidAlignmentFails) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + // Should fail because alignment is not a power of 2 + std::vector invalid_alignments = {0, 5, 6, 12, 34}; + for (auto alignment : invalid_alignments) { + auto p = allocator.allocate(256, alignment); + EXPECT_EQ(p, nullptr); + } +} + +TEST_F(CPUCachingAllocatorTest, MaxSizeRespected) { + constexpr size_t kMaxSize = 1024; // 1KB max + CPUCachingAllocator allocator(kMaxSize); + + // Allocate close to the max size + auto p1 = allocator.allocate(512); + EXPECT_NE(p1, nullptr); + + auto p2 = allocator.allocate(512); + EXPECT_NE(p2, nullptr); + + // This should trigger cache freeing since we would exceed max_size + auto p3 = allocator.allocate(512); + EXPECT_NE(p3, 
nullptr); +} + +TEST_F(CPUCachingAllocatorTest, MultipleAllocationsAndResets) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + for (int i = 0; i < 5; ++i) { + auto p1 = allocator.allocate(256); + auto p2 = allocator.allocate(512); + auto p3 = allocator.allocate(1024); + + EXPECT_NE(p1, nullptr); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p3, nullptr); + + allocator.reset(); + } +} + +TEST_F(CPUCachingAllocatorTest, MemoryWriteability) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + const size_t size = 1024; + auto p = allocator.allocate(size); + EXPECT_NE(p, nullptr); + + // Write to allocated memory + memset(p, 0x55, size); + + // Read back and verify + uint8_t* bytes = reinterpret_cast(p); + for (size_t i = 0; i < size; ++i) { + EXPECT_EQ(bytes[i], 0x55); + } + + allocator.reset(); +} + +TEST_F(CPUCachingAllocatorTest, CachingWithMultipleSizes) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + // Allocate various sizes + auto p1 = allocator.allocate(128); + auto p2 = allocator.allocate(256); + auto p3 = allocator.allocate(512); + auto p4 = allocator.allocate(128); + + // Reset to cache them + allocator.reset(); + + // Allocate same sizes - should reuse cached pointers + auto p5 = allocator.allocate(128); + auto p6 = allocator.allocate(256); + auto p7 = allocator.allocate(512); + + EXPECT_TRUE((p5 == p1) || (p5 == p4)); + EXPECT_EQ(p6, p2); + EXPECT_EQ(p7, p3); +} + +TEST_F(CPUCachingAllocatorTest, ThreadSafety) { + CPUCachingAllocator allocator(4 * 1024 * 1024); // 4MB max size + + std::vector threads; + std::vector allocated_ptrs; + std::mutex ptrs_mutex; + + const int num_threads = 4; + const int allocations_per_thread = 10; + + // Lambda function for thread work + auto thread_work = [&]() { + for (int i = 0; i < allocations_per_thread; ++i) { + size_t size = (i + 1) * 64; + auto p = allocator.allocate(size); + EXPECT_NE(p, nullptr); + EXPECT_ALIGNED(p, kDefaultAlignment); + + { + std::lock_guard guard(ptrs_mutex); + allocated_ptrs.push_back(p); + } + } + + // Reset in each thread + allocator.reset(); + }; + + // Create threads + for (int i = 0; i < num_threads; ++i) { + threads.emplace_back(thread_work); + } + + // Wait for all threads to finish + for (auto& thread : threads) { + thread.join(); + } + + // Verify all allocations were valid + EXPECT_EQ(allocated_ptrs.size(), num_threads * allocations_per_thread); +} + +TEST_F(CPUCachingAllocatorTest, LargeAllocation) { + CPUCachingAllocator allocator(10 * 1024 * 1024); // 10MB max size + + const size_t large_size = 1024 * 1024; // 1MB allocation + auto p = allocator.allocate(large_size); + EXPECT_NE(p, nullptr); + EXPECT_ALIGNED(p, kDefaultAlignment); + + // Write and verify + memset(p, 0xAA, large_size); + uint8_t* bytes = reinterpret_cast(p); + for (size_t i = 0; i < 1000; ++i) { // Sample check + EXPECT_EQ(bytes[i], 0xAA); + } + + allocator.reset(); + + // Re-allocate same size should reuse cached pointer + auto p2 = allocator.allocate(large_size); + EXPECT_EQ(p, p2); +} + +TEST_F(CPUCachingAllocatorTest, SizeAlignmentAdjustment) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + // Test that allocation sizes get properly aligned + auto p1 = allocator.allocate(100, 256); // Size not aligned to 256 + EXPECT_NE(p1, nullptr); + EXPECT_ALIGNED(p1, 256); + + auto p2 = allocator.allocate(100, 256); + // Should not get cached pointer since size was adjusted during first + // allocation + allocator.reset(); + + auto p3 = allocator.allocate(100, 256); + // Should reuse p1 
due to alignment adjustment + EXPECT_EQ(p1, p3); +} + +TEST_F(CPUCachingAllocatorTest, ResetMultipleTimes) { + CPUCachingAllocator allocator(1024 * 1024); // 1MB max size + + for (int i = 0; i < 3; ++i) { + auto p = allocator.allocate(512); + EXPECT_NE(p, nullptr); + allocator.reset(); + + auto p2 = allocator.allocate(512); + EXPECT_EQ(p, p2); + } +} diff --git a/extension/memory_allocator/test/targets.bzl b/extension/memory_allocator/test/targets.bzl index 77fb6936a3a..5855bee5c14 100644 --- a/extension/memory_allocator/test/targets.bzl +++ b/extension/memory_allocator/test/targets.bzl @@ -15,3 +15,13 @@ def define_common_targets(): "//executorch/extension/memory_allocator:malloc_memory_allocator", ], ) + + runtime.cxx_test( + name = "cpu_caching_malloc_allocator_test", + srcs = [ + "cpu_caching_malloc_allocator_test.cpp", + ], + deps = [ + "//executorch/extension/memory_allocator:cpu_caching_allocator", + ], + ) diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl index 8d8893f7454..0529c814f14 100644 --- a/shim_et/xplat/executorch/build/build_variables.bzl +++ b/shim_et/xplat/executorch/build/build_variables.bzl @@ -337,6 +337,10 @@ EXTENSION_FLAT_TENSOR_SRCS = [ "extension/flat_tensor/serialize/flat_tensor_header.cpp", ] +EXTENSION_MEMORY_ALLOCATOR_SRCS = [ + "extension/memory_allocator/cpu_caching_malloc_allocator.cpp", +] + EXTENSION_MODULE_SRCS = [ "extension/module/module.cpp", ] diff --git a/tools/cmake/Codegen.cmake b/tools/cmake/Codegen.cmake index 32d3d8b554f..e838e62c582 100644 --- a/tools/cmake/Codegen.cmake +++ b/tools/cmake/Codegen.cmake @@ -398,6 +398,7 @@ function(executorch_load_build_variables) EXTENSION_DATA_LOADER_SRCS EXTENSION_EVALUE_UTIL_SRCS EXTENSION_FLAT_TENSOR_SRCS + EXTENSION_MEMORY_ALLOCATOR_SRCS EXTENSION_MODULE_SRCS EXTENSION_NAMED_DATA_MAP_SRCS EXTENSION_RUNNER_UTIL_SRCS @@ -431,6 +432,7 @@ function(executorch_load_build_variables) _extension_data_loader__srcs _extension_evalue_util__srcs _extension_flat_tensor__srcs + _extension_memory_allocator__srcs _extension_module__srcs _extension_named_data_map__srcs _extension_runner_util__srcs From dbf63cc79a2d883d50dd1c6dc254a530318b645b Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 6 Nov 2025 12:51:18 -0800 Subject: [PATCH 06/28] Update base for Update on "[Executorch] Introduce caching cpu memory allocator" Meant to use this for temp allocator for kernels. 
Specifically for sdpa, it seems that on iOS there is a significant overhead coming from allocations Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/) [ghstack-poisoned] --- extension/llm/custom_ops/TARGETS | 1 + extension/llm/custom_ops/op_sdpa_impl.h | 11 +++++++---- extension/llm/custom_ops/test_quantized_sdpa.py | 6 ++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/extension/llm/custom_ops/TARGETS b/extension/llm/custom_ops/TARGETS index 9a437e7dad5..5dda2318f3f 100644 --- a/extension/llm/custom_ops/TARGETS +++ b/extension/llm/custom_ops/TARGETS @@ -60,5 +60,6 @@ runtime.python_test( ], deps = [ "//caffe2:torch", + "//executorch/extension/pybindings:portable_lib", ], ) diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h index 3c682d8c4b1..21acd6130eb 100644 --- a/extension/llm/custom_ops/op_sdpa_impl.h +++ b/extension/llm/custom_ops/op_sdpa_impl.h @@ -779,11 +779,13 @@ void cpu_flash_attention( // Lets align size_per_thread_qdq_vec to 64 bytes, for coalesced cache reads, // by padding with right number of per thread elements constexpr int64_t kAlignment = 32; - size_per_thread_qdq_vec = (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1)); - int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * query.element_size(); + size_per_thread_qdq_vec = + (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1)); + int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * sizeof(accum_t); int64_t size_qdq_bytes = size_per_thread_qdq_bytes * num_thread; std::vector scratch_for_quant_dequant_vec(size_qdq_bytes); - accum_t* scratch_for_quant_dequant = reinterpret_cast(scratch_for_quant_dequant_vec.data()); + accum_t* scratch_for_quant_dequant = + reinterpret_cast(scratch_for_quant_dequant_vec.data()); // Data ptrs const scalar_t* q_data = query.const_data_ptr(); @@ -808,7 +810,8 @@ void cpu_flash_attention( scalar_t* qk_reduced_data = is_reduced_type ? 
buf_reduced_data + ompIdx * qSplitSize * kvSplitSize : nullptr;
-  accum_t* buf_qdq_ptr = scratch_for_quant_dequant + ompIdx * size_per_thread_qdq_vec;
+  accum_t* buf_qdq_ptr =
+      scratch_for_quant_dequant + ompIdx * size_per_thread_qdq_vec;
 
   for (int64_t z = begin; z < end; z++) {
     int64_t m = k * qSplitSize;
diff --git a/extension/llm/custom_ops/test_quantized_sdpa.py b/extension/llm/custom_ops/test_quantized_sdpa.py
index 87026d5c251..e6edf6ffbb1 100644
--- a/extension/llm/custom_ops/test_quantized_sdpa.py
+++ b/extension/llm/custom_ops/test_quantized_sdpa.py
@@ -12,6 +12,7 @@
 import torch.nn.functional as F
 
 from executorch.extension.llm.custom_ops import custom_ops  # noqa
+from executorch.extension.pybindings.portable_lib import _unsafe_reset_threadpool
 
 
 def is_fbcode():
@@ -40,6 +41,11 @@ def setUp(self):
         self.q_shape = None
         self.kv_shape = None
         self.is_seq_at_dim_2 = True
+        # For some reason 4 threads doesn't work.
+        # This setting is needed to make this test not flaky due to OMP
+        # error of "OMP: Error #131: Thread identifier invalid".
+        # Not clear why that happens, but having a smaller threadpool resolves it.
+        _unsafe_reset_threadpool(3)
 
     def _scale_tensor(self, tensor, min_value, max_value, scale=True):
         normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())

From 68d76d310737b4ded7b7cfc44a507d03df1a36d9 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Mon, 10 Nov 2025 20:34:35 -0800
Subject: [PATCH 07/28] Update base for Update on "[Executorch] Introduce
 caching cpu memory allocator"

Meant to use this for temp allocator for kernels.
Specifically for sdpa, it seems that on iOS there is a significant overhead
coming from allocations.

Differential Revision: [D85532079](https://our.internmc.facebook.com/intern/diff/D85532079/)

[ghstack-poisoned]

From 351a400d4595c2cdf3a045bdb275ee6a1eeafbf1 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Mon, 10 Nov 2025 20:34:38 -0800
Subject: [PATCH 08/28] [Executorch] Use temp allocator for allocating scratch
 memory

This allows us to leverage the temp memory allocator; if that allocator is a
caching allocator, it reduces the allocation overhead.
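Spelled out, the pattern this patch lands is: ask the kernel's runtime context for temp memory first, and fall back to a plain heap allocation when no temp allocator is installed or the request fails. A minimal sketch under those assumptions (the helper name and the char[] fallback are illustrative; allocate_temp and Result are the ExecuTorch APIs the hunks below use):

#include <executorch/runtime/kernel/kernel_runtime_context.h>

#include <memory>

using executorch::runtime::KernelRuntimeContext;

// Illustrative helper, not part of the patch: prefer the context's temp
// allocator, fall back to the heap when it is absent or fails.
void* scratch_or_heap(
    KernelRuntimeContext& ctx,
    std::unique_ptr<char[]>& heap_fallback, // keeps the fallback buffer alive
    size_t size_bytes) {
  auto res = ctx.allocate_temp(size_bytes);
  if (res.ok()) {
    // Owned by the temp allocator; with a caching allocator installed,
    // repeated requests of the same size become cheap.
    return res.get();
  }
  heap_fallback = std::make_unique<char[]>(size_bytes);
  return heap_fallback.get();
}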
Differential Revision: [D85532076](https://our.internmc.facebook.com/intern/diff/D85532076/) [ghstack-poisoned] --- extension/llm/custom_ops/op_sdpa.cpp | 6 ++++ extension/llm/custom_ops/op_sdpa_impl.h | 39 ++++++++++++++++--------- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index c98fa1729fa..72bddce7b5b 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -273,6 +273,7 @@ Tensor& flash_attention_kernel_out( // we might consider another appraoch if (seq_len >= 768) { sdpa::impl::cpu_flash_attention( + ctx, output, query, key, @@ -289,6 +290,7 @@ Tensor& flash_attention_kernel_out( nullopt); } else if (seq_len >= 192) { sdpa::impl::cpu_flash_attention( + ctx, output, query, key, @@ -305,6 +307,7 @@ Tensor& flash_attention_kernel_out( nullopt); } else { sdpa::impl::cpu_flash_attention( + ctx, output, query, key, @@ -418,6 +421,7 @@ Tensor& custom_sdpa_out_impl( // we might consider another appraoch if (seq_len >= 768) { sdpa::impl::cpu_flash_attention( + ctx, output, q, k, @@ -437,6 +441,7 @@ Tensor& custom_sdpa_out_impl( num_keys_for_causal_attention); } else if (seq_len >= 192) { sdpa::impl::cpu_flash_attention( + ctx, output, q, k, @@ -456,6 +461,7 @@ Tensor& custom_sdpa_out_impl( num_keys_for_causal_attention); } else { sdpa::impl::cpu_flash_attention( + ctx, output, q, k, diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h index 21acd6130eb..a418992da3f 100644 --- a/extension/llm/custom_ops/op_sdpa_impl.h +++ b/extension/llm/custom_ops/op_sdpa_impl.h @@ -35,6 +35,7 @@ enum class SeqDim { ONE = 1, TWO }; namespace sdpa::impl { +static std::vector scratch_for_quant_dequant_vec; struct MaybeQuantizedMatrixData { const void* data{nullptr}; const int8_t* zero_points{nullptr}; @@ -543,6 +544,7 @@ TODO: Just handle conversion of bool mask to float */ template void cpu_flash_attention( + RuntimeContext& ctx, Tensor& output, const Tensor& query, const Tensor& key, @@ -766,26 +768,37 @@ void cpu_flash_attention( int64_t size_of_intermediate_precision = sizeof(accum_t); int64_t size_bytes = size_per_thread * num_thread * query.element_size() * size_of_intermediate_precision; - std::vector buf_vec(size_bytes); - void* buf = reinterpret_cast(buf_vec.data()); - // Need to double check the following - size_bytes = num_thread * qSplitSize * kvSplitSize * query.element_size(); - std::vector buf_reduced_vec(size_bytes); - void* buf_reduced = reinterpret_cast(buf_reduced_vec.data()); - // at::Tensor buf_reduced = at::empty( - // {num_thread, qSplitSize, is_reduced_type ? 
kvSplitSize : 0},
  //     query.options());
+  Result buff_res = ctx.allocate_temp(size_bytes);
+  std::unique_ptr allocated_buf;
+  void* buf;
+  if (!buff_res.ok()) {
+    allocated_buf = std::make_unique(size_bytes);
+    buf = reinterpret_cast(allocated_buf.get());
+  } else {
+    buf = buff_res.get();
+  }
+  void* buf_reduced = nullptr;
   int64_t size_per_thread_qdq_vec = qSplitSize * kvSplitSize * headSize;
   // Lets align size_per_thread_qdq_vec to 64 bytes, for coalesced cache reads,
   // by padding with right number of per thread elements
   constexpr int64_t kAlignment = 32;
   size_per_thread_qdq_vec =
       (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1));
-  int64_t size_per_thread_qdq_bytes = size_per_thread_qdq_vec * sizeof(accum_t);
+  int64_t size_per_thread_qdq_bytes =
+      size_per_thread_qdq_vec * size_of_intermediate_precision;
   int64_t size_qdq_bytes = size_per_thread_qdq_bytes * num_thread;
-  std::vector scratch_for_quant_dequant_vec(size_qdq_bytes);
-  accum_t* scratch_for_quant_dequant =
-      reinterpret_cast(scratch_for_quant_dequant_vec.data());
+  std::unique_ptr allocated_buf_for_qdq;
+  Result scratch_for_quant_dequant_res =
+      ctx.allocate_temp(size_qdq_bytes);
+  accum_t* scratch_for_quant_dequant;
+  if (!scratch_for_quant_dequant_res.ok()) {
+    allocated_buf_for_qdq = std::make_unique(size_qdq_bytes);
+    scratch_for_quant_dequant =
+        reinterpret_cast(allocated_buf_for_qdq.get());
+  } else {
+    scratch_for_quant_dequant =
+        reinterpret_cast(scratch_for_quant_dequant_res.get());
+  }
 
   // Data ptrs
   const scalar_t* q_data = query.const_data_ptr();

From b4fdc2234e04212d3cec2cbadbc8dbf08e17f4f5 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Mon, 10 Nov 2025 20:34:43 -0800
Subject: [PATCH 09/28] [Executorch] Make module constructors uniform across

Existing constructors don't compose well: if you want the data loader or data
files constructor, you cannot override the memory allocator. Fix that.

Differential Revision: [D86120037](https://our.internmc.facebook.com/intern/diff/D86120037/)

[ghstack-poisoned]
---
 extension/module/module.cpp | 30 ++++++++++++++++++++++++------
 extension/module/module.h   |  6 ++++++
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/extension/module/module.cpp b/extension/module/module.cpp
index 35228d06729..b95f40b9f40 100644
--- a/extension/module/module.cpp
+++ b/extension/module/module.cpp
@@ -78,11 +78,17 @@ runtime::Result> make_data_loader(
 Module::Module(
     const std::string& file_path,
     const LoadMode load_mode,
+    std::unique_ptr memory_allocator,
+    std::unique_ptr temp_allocator,
     std::unique_ptr event_tracer)
     : file_path_(file_path),
       load_mode_(load_mode),
-      memory_allocator_(std::make_unique()),
-      temp_allocator_(std::make_unique()),
+      memory_allocator_(
+          memory_allocator ? std::move(memory_allocator)
+                           : std::make_unique()),
+      temp_allocator_(
+          temp_allocator ? std::move(temp_allocator)
+                         : std::make_unique()),
       event_tracer_(std::move(event_tracer)) {
   runtime::runtime_init();
 }
@@ -91,11 +97,17 @@ Module::Module(
     const std::string& file_path,
     const std::string& data_map_path,
     const LoadMode load_mode,
+    std::unique_ptr memory_allocator,
+    std::unique_ptr temp_allocator,
     std::unique_ptr event_tracer)
     : file_path_(file_path),
       load_mode_(load_mode),
-      memory_allocator_(std::make_unique()),
-      temp_allocator_(std::make_unique()),
+      memory_allocator_(
+          memory_allocator ? std::move(memory_allocator)
+                           : std::make_unique()),
+      temp_allocator_(
+          temp_allocator ?
std::move(temp_allocator) + : std::make_unique()), event_tracer_(std::move(event_tracer)) { if (!data_map_path.empty()) { data_files_.push_back(data_map_path); @@ -107,12 +119,18 @@ Module::Module( const std::string& file_path, std::vector data_files, const LoadMode load_mode, + std::unique_ptr memory_allocator, + std::unique_ptr temp_allocator, std::unique_ptr event_tracer) : file_path_(file_path), data_files_(std::move(data_files)), load_mode_(load_mode), - memory_allocator_(std::make_unique()), - temp_allocator_(std::make_unique()), + memory_allocator_( + memory_allocator ? std::move(memory_allocator) + : std::make_unique()), + temp_allocator_( + temp_allocator ? std::move(temp_allocator) + : std::make_unique()), event_tracer_(std::move(event_tracer)) { runtime::runtime_init(); } diff --git a/extension/module/module.h b/extension/module/module.h index e523f163317..10fc366cb04 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -63,6 +63,8 @@ class Module { explicit Module( const std::string& file_path, const LoadMode load_mode = LoadMode::File, + std::unique_ptr memory_allocator = nullptr, + std::unique_ptr temp_allocator = nullptr, std::unique_ptr event_tracer = nullptr); /** @@ -78,6 +80,8 @@ class Module { const std::string& file_path, const std::string& data_map_path, const LoadMode load_mode = LoadMode::File, + std::unique_ptr memory_allocator = nullptr, + std::unique_ptr temp_allocator = nullptr, std::unique_ptr event_tracer = nullptr); /** @@ -93,6 +97,8 @@ class Module { const std::string& file_path, std::vector data_files, const LoadMode load_mode = LoadMode::File, + std::unique_ptr memory_allocator = nullptr, + std::unique_ptr temp_allocator = nullptr, std::unique_ptr event_tracer = nullptr); /** From e73b365325fc0d8d6e1d83edff08f76a819b6683 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 20 Nov 2025 09:30:14 -0800 Subject: [PATCH 10/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- extension/memory_allocator/cpu_caching_malloc_allocator.h | 4 +++- .../test/cpu_caching_malloc_allocator_test.cpp | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.h b/extension/memory_allocator/cpu_caching_malloc_allocator.h index bfddc7cd0bc..188f762a393 100644 --- a/extension/memory_allocator/cpu_caching_malloc_allocator.h +++ b/extension/memory_allocator/cpu_caching_malloc_allocator.h @@ -73,7 +73,9 @@ class CPUCachingAllocator : public executorch::runtime::MemoryAllocator { // Checks the cache to see if allocation of size bytes can be found. // If so return cached memory, else // allocates memory, records it for caching and returns. 
- void* allocate(size_t size, size_t alignment = kCachingAllocatorDefaultAlignment) override; + void* allocate( + size_t size, + size_t alignment = kCachingAllocatorDefaultAlignment) override; void reset() override; ~CPUCachingAllocator(); }; diff --git a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp index 9d6265e1f09..c2e11144d43 100644 --- a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp +++ b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp @@ -12,7 +12,8 @@ using namespace ::testing; using executorch::extension::CPUCachingAllocator; -constexpr auto kDefaultAlignment = executorch::extension::kCachingAllocatorDefaultAlignment; +constexpr auto kDefaultAlignment = + executorch::extension::kCachingAllocatorDefaultAlignment; class CPUCachingAllocatorTest : public ::testing::Test { protected: From f12869c88c2742863b792091060d214c5faab5cd Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 20 Nov 2025 12:49:50 -0800 Subject: [PATCH 11/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- extension/llm/custom_ops/op_sdpa_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h index a418992da3f..2212c7d60b7 100644 --- a/extension/llm/custom_ops/op_sdpa_impl.h +++ b/extension/llm/custom_ops/op_sdpa_impl.h @@ -778,10 +778,10 @@ void cpu_flash_attention( buf = buff_res.get(); } void* buf_reduced = nullptr; - int64_t size_per_thread_qdq_vec = qSplitSize * kvSplitSize * headSize; + int64_t size_per_thread_qdq_vec = kvSplitSize * headSize; // Lets align size_per_thread_qdq_vec to 64 bytes, for coalesced cache reads, // by padding with right number of per thread elements - constexpr int64_t kAlignment = 32; + constexpr int64_t kAlignment = 64; size_per_thread_qdq_vec = (size_per_thread_qdq_vec + kAlignment - 1) & (-(kAlignment - 1)); int64_t size_per_thread_qdq_bytes = From ca1757a328e2bb63f14c4df733c2a673ced9dda8 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Sun, 23 Nov 2025 11:55:49 -0800 Subject: [PATCH 12/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- .../memory_allocator/cpu_caching_malloc_allocator.h | 5 +++-- .../test/cpu_caching_malloc_allocator_test.cpp | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.h b/extension/memory_allocator/cpu_caching_malloc_allocator.h index af36c30d069..42affef7599 100644 --- a/extension/memory_allocator/cpu_caching_malloc_allocator.h +++ b/extension/memory_allocator/cpu_caching_malloc_allocator.h @@ -59,8 +59,9 @@ class CPUCachingAllocator : public executorch::runtime::MemoryAllocator { // New invariants must be written. 
FlatHashMap> available_map_; FlatHashMap allocation_map_; - // Since allocation_map, which is a global instance, is mutated/read via - // all public APIs we need a global mutex. + // Since allocation_map_ and other member variables are mutated/read via + // all public APIs, we need a mutex to protect concurrent access to these + // instance members. std::mutex mutex_; size_t max_size_; size_t current_size_; diff --git a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp index 0875be1491c..50892857135 100644 --- a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp +++ b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp @@ -280,7 +280,7 @@ TEST_F(CPUCachingAllocatorTest, SizeAlignmentAdjustment) { CPUCachingAllocator allocator(1024 * 1024); // 1MB max size // Test that allocation sizes get properly aligned - auto p1 = allocator.allocate(100, 256); // Size not aligned to 256 + auto p1 = allocator.allocate(100, 256); // Size aligned to 256 EXPECT_NE(p1, nullptr); EXPECT_ALIGNED(p1, 256); @@ -289,9 +289,9 @@ TEST_F(CPUCachingAllocatorTest, SizeAlignmentAdjustment) { // allocation allocator.reset(); - auto p3 = allocator.allocate(100, 256); + auto p3 = allocator.allocate(100, 512); // Should reuse p1 due to alignment adjustment - EXPECT_EQ(p1, p3); + EXPECT_NE(p1, p3); } TEST_F(CPUCachingAllocatorTest, ResetMultipleTimes) { @@ -348,7 +348,8 @@ TEST_F(CPUCachingAllocatorTest, ResetCachesWhenUnderMaxSize) { EXPECT_NE(p1, nullptr); EXPECT_NE(p2, nullptr); - // Reset should cache the allocations since current_size (1024) <= max_size (2048) + // Reset should cache the allocations since current_size (1024) <= max_size + // (2048) allocator.reset(); // Subsequent allocations should reuse the cached pointers From a4912c51c287869734dbdf0fc9046d666e0a7fd9 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Sun, 23 Nov 2025 15:09:42 -0800 Subject: [PATCH 13/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. 
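Taken together, the test updates above pin down the allocator's contract: while current_size stays within max_size, reset() parks live allocations in the cache instead of freeing them, and a subsequent allocate of the same size hands the cached block back. A minimal usage sketch of that contract (the 2048/512 sizes are illustrative):

#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
#include <executorch/runtime/platform/runtime.h>

#include <cassert>

int main() {
  // The allocator logs via ET_LOG, so the PAL must be initialized first,
  // just as the test fixture does.
  executorch::runtime::runtime_init();

  executorch::extension::CPUCachingAllocator allocator(/*max_size=*/2048);

  void* first = allocator.allocate(512);
  // reset() caches the block rather than freeing it, since the 2048-byte
  // budget is not exceeded; the next 512-byte request reuses it.
  allocator.reset();
  void* second = allocator.allocate(512);
  assert(first == second);
  return 0;
}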
Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- extension/module/module.cpp | 12 ++++++------ extension/module/module.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/extension/module/module.cpp b/extension/module/module.cpp index b95f40b9f40..f11aa7200c1 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -78,9 +78,9 @@ runtime::Result> make_data_loader( Module::Module( const std::string& file_path, const LoadMode load_mode, + std::unique_ptr event_tracer, std::unique_ptr memory_allocator, - std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr temp_allocator) : file_path_(file_path), load_mode_(load_mode), memory_allocator_( @@ -97,9 +97,9 @@ Module::Module( const std::string& file_path, const std::string& data_map_path, const LoadMode load_mode, + std::unique_ptr event_tracer, std::unique_ptr memory_allocator, - std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr temp_allocator) : file_path_(file_path), load_mode_(load_mode), memory_allocator_( @@ -119,9 +119,9 @@ Module::Module( const std::string& file_path, std::vector data_files, const LoadMode load_mode, + std::unique_ptr event_tracer, std::unique_ptr memory_allocator, - std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr temp_allocator) : file_path_(file_path), data_files_(std::move(data_files)), load_mode_(load_mode), diff --git a/extension/module/module.h b/extension/module/module.h index 10fc366cb04..6d8da026abe 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -63,9 +63,9 @@ class Module { explicit Module( const std::string& file_path, const LoadMode load_mode = LoadMode::File, + std::unique_ptr event_tracer = nullptr, std::unique_ptr memory_allocator = nullptr, - std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr temp_allocator = nullptr); /** * Constructs an instance by loading a program from a file with specified @@ -80,9 +80,9 @@ class Module { const std::string& file_path, const std::string& data_map_path, const LoadMode load_mode = LoadMode::File, + std::unique_ptr event_tracer = nullptr, std::unique_ptr memory_allocator = nullptr, - std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr temp_allocator = nullptr); /** * Constructs an instance by loading a program from a file with specified @@ -97,9 +97,9 @@ class Module { const std::string& file_path, std::vector data_files, const LoadMode load_mode = LoadMode::File, + std::unique_ptr event_tracer = nullptr, std::unique_ptr memory_allocator = nullptr, - std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr temp_allocator = nullptr); /** * Constructs an instance with the provided data loader and memory allocator. From 39cd25def5f5873d0fc9bc9860539ca17572107d Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Mon, 24 Nov 2025 08:38:50 -0800 Subject: [PATCH 14/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. 
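For reference, with the parameter order above, a caller that only wants a custom temp allocator passes nullptr for the tracer and for the default allocator. A hedged construction sketch (the model path and cache size are illustrative):

#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
#include <executorch/extension/module/module.h>

#include <memory>

using executorch::extension::CPUCachingAllocator;
using executorch::extension::Module;

int main() {
  Module module(
      "/path/to/model.pte", // illustrative path
      Module::LoadMode::File,
      /*event_tracer=*/nullptr,
      /*memory_allocator=*/nullptr, // keep the default malloc allocator
      /*temp_allocator=*/std::make_unique<CPUCachingAllocator>(
          /*max_size=*/10 * 1024 * 1024));
  // Ops that call ctx.allocate_temp() now draw from the caching allocator.
  return 0;
}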
Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)

[ghstack-poisoned]
---
 .../test/cpu_caching_malloc_allocator_test.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
index 50892857135..df7f82d24a1 100644
--- a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
+++ b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp
@@ -330,12 +330,8 @@ TEST_F(CPUCachingAllocatorTest, ResetFreesEverythingWhenOverMaxSize) {
   EXPECT_NE(p5, nullptr);
 
   // These should be new allocations, not cached ones
-  EXPECT_NE(p4, p1);
-  EXPECT_NE(p4, p2);
-  EXPECT_NE(p4, p3);
-  EXPECT_NE(p5, p1);
-  EXPECT_NE(p5, p2);
-  EXPECT_NE(p5, p3);
+  // However, the system allocator might cache and return the same pointers,
+  // so we can't check for strict equality or inequality.
 }
 
 TEST_F(CPUCachingAllocatorTest, ResetCachesWhenUnderMaxSize) {

From 5bce9566f09a2af00ce5a58fcfccf0399af7ee3a Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Mon, 24 Nov 2025 10:04:15 -0800
Subject: [PATCH 15/28] Update base for Update on "[Executorch][LLM] Use
 caching allocator for runner"

We observed that on iOS it improves perf by 6% because SDPA op does temp
allocations. No significant difference on android though.

Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)

[ghstack-poisoned]

From 0bf3b2e0b25b6ed64a46f721d99b40595c5425e4 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Wed, 3 Dec 2025 19:41:25 -0800
Subject: [PATCH 16/28] Update base for Update on "[Executorch][LLM] Use
 caching allocator for runner"

We observed that on iOS it improves perf by 6% because SDPA op does temp
allocations. No significant difference on android though.

Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)

[ghstack-poisoned]

From d83b4a9fa81db13c0687451824f4a46c16d9a893 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Wed, 3 Dec 2025 19:54:29 -0800
Subject: [PATCH 17/28] Update base for Update on "[Executorch][LLM] Use
 caching allocator for runner"

We observed that on iOS it improves perf by 6% because SDPA op does temp
allocations. No significant difference on android though.

Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)

[ghstack-poisoned]

From 2d7994584b0943222ce755319dbd30072fb8097f Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Thu, 4 Dec 2025 08:34:23 -0800
Subject: [PATCH 18/28] Update base for Update on "[Executorch][LLM] Use
 caching allocator for runner"

We observed that on iOS it improves perf by 6% because SDPA op does temp
allocations. No significant difference on android though.

Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)

[ghstack-poisoned]

From 365be542c21be1060ae0e602dcf1c87771355edb Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Thu, 4 Dec 2025 08:49:36 -0800
Subject: [PATCH 19/28] Update base for Update on "[Executorch][LLM] Use
 caching allocator for runner"

We observed that on iOS it improves perf by 6% because SDPA op does temp
allocations. No significant difference on android though.
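The assertion relaxation in PATCH 14 above reflects a real property of heap allocators: after free(), a subsequent malloc() of the same size frequently, though not deterministically, returns the recycled address, so EXPECT_NE on raw pointers is flaky by construction. A small standalone demonstration:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  void* a = std::malloc(512);
  // Record the address before freeing; using a freed pointer's value
  // directly is formally indeterminate, so compare integers instead.
  auto first = reinterpret_cast<std::uintptr_t>(a);
  std::free(a);

  void* b = std::malloc(512); // often, but not always, the same address
  auto second = reinterpret_cast<std::uintptr_t>(b);
  std::printf("address recycled: %s\n", first == second ? "yes" : "no");
  std::free(b);
  return 0;
}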
Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] From ba270072f0b7a8abb09b53082e2f5b40f1421598 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 4 Dec 2025 08:57:49 -0800 Subject: [PATCH 20/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] From 20854fc6641885ba4d003d7c0eceff46f367c1ae Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 4 Dec 2025 12:51:23 -0800 Subject: [PATCH 21/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] From 36cce27011b23e943de15b4e1b94f20956071d9a Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 4 Dec 2025 13:04:42 -0800 Subject: [PATCH 22/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] From bae4829e527315bdfdf32e902d40809a37836f86 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Fri, 5 Dec 2025 08:01:35 -0800 Subject: [PATCH 23/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- .../cpu_caching_malloc_allocator.cpp | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp index af541e66d98..1fe5f09df7a 100644 --- a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp +++ b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp @@ -4,18 +4,6 @@ namespace executorch::extension { -namespace { -size_t get_alignment_adjusted_size(size_t size, size_t alignment) { - if (size % alignment != 0) { - // Adjust size to the next multiple of alignment - // This is needed for aligned_alloc to work - return (size + alignment) & ~(alignment - 1); - } else { - return size; - } -} -} // namespace - CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size) : MemoryAllocator(0, nullptr) { max_size_ = max_size; @@ -30,7 +18,6 @@ void* CPUCachingAllocator::allocate(size_t size, size_t alignment) { return nullptr; } alignment = std::max(alignment, kCachingAllocatorDefaultAlignment); - size = get_alignment_adjusted_size(size, alignment); std::lock_guard guard(mutex_); const auto& it = available_map_.find(size); @@ -39,7 +26,14 @@ void* CPUCachingAllocator::allocate(size_t size, size_t alignment) { // 2. 
Allocate new memory // 2 can lead to current_size > max_size_ if (it == available_map_.end() || it->second.empty()) { - void* ptr = std::malloc(size); + void* ptr = nullptr; +#if defined(__ANDROID__) + ptr = memalign(alignment, size); +#elif defined(_MSC_VER) + ptr = _aligned_malloc(size, alignment); +#else + ptr = std::aligned_alloc(alignment, size); +#endif if (ptr == nullptr) { ET_LOG(Error, "Failed to allocate memory"); return nullptr; @@ -51,7 +45,7 @@ void* CPUCachingAllocator::allocate(size_t size, size_t alignment) { void* ptr = it->second.back(); it->second.pop_back(); allocation_map_[ptr] = size; - return ptr; + return alignPointer(ptr, alignment); } void CPUCachingAllocator::free_everything() { From 71cc53200622b42d0dc560e61da7619fb976f77d Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Fri, 5 Dec 2025 10:27:46 -0800 Subject: [PATCH 24/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- .../cpu_caching_malloc_allocator.cpp | 15 ++++--- .../cpu_caching_malloc_allocator.h | 8 +++- .../malloc_memory_allocator.h | 18 +++------ .../memory_allocator/memory_allocator_utils.h | 39 +++++++++++++++++++ extension/memory_allocator/targets.bzl | 6 +++ .../cpu_caching_malloc_allocator_test.cpp | 3 +- 6 files changed, 66 insertions(+), 23 deletions(-) create mode 100644 extension/memory_allocator/memory_allocator_utils.h diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp index 1fe5f09df7a..084ca2eb15a 100644 --- a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp +++ b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp @@ -1,6 +1,7 @@ #include #include +#include namespace executorch::extension { @@ -18,6 +19,11 @@ void* CPUCachingAllocator::allocate(size_t size, size_t alignment) { return nullptr; } alignment = std::max(alignment, kCachingAllocatorDefaultAlignment); + size_t adjusted_size = executorch::extension::utils::get_aligned_size(size, alignment); + if (adjusted_size == 0) { + return nullptr; + } + size = adjusted_size; std::lock_guard guard(mutex_); const auto& it = available_map_.find(size); @@ -26,14 +32,7 @@ void* CPUCachingAllocator::allocate(size_t size, size_t alignment) { // 2. Allocate new memory // 2 can lead to current_size > max_size_ if (it == available_map_.end() || it->second.empty()) { - void* ptr = nullptr; -#if defined(__ANDROID__) - ptr = memalign(alignment, size); -#elif defined(_MSC_VER) - ptr = _aligned_malloc(size, alignment); -#else - ptr = std::aligned_alloc(alignment, size); -#endif + void* ptr = std::malloc(size); if (ptr == nullptr) { ET_LOG(Error, "Failed to allocate memory"); return nullptr; diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.h b/extension/memory_allocator/cpu_caching_malloc_allocator.h index 42affef7599..9f352c9aaa1 100644 --- a/extension/memory_allocator/cpu_caching_malloc_allocator.h +++ b/extension/memory_allocator/cpu_caching_malloc_allocator.h @@ -70,7 +70,13 @@ class CPUCachingAllocator : public executorch::runtime::MemoryAllocator { /* max_size: Maximum size of memory to cache. Never cache more than that. 
 */
-  CPUCachingAllocator(uint32_t max_size);
+  explicit CPUCachingAllocator(uint32_t max_size);
+  // No copies allowed
+  CPUCachingAllocator(const CPUCachingAllocator&) = delete;
+  CPUCachingAllocator& operator=(const CPUCachingAllocator&) = delete;
+  // No moves allowed
+  CPUCachingAllocator(CPUCachingAllocator&&) = delete;
+  CPUCachingAllocator& operator=(CPUCachingAllocator&&) = delete;
   // Checks the cache to see if allocation of size bytes can be found.
   // If so return cached memory, else
   // allocates memory, records it for caching and returns.
diff --git a/extension/memory_allocator/malloc_memory_allocator.h b/extension/memory_allocator/malloc_memory_allocator.h
index 3dede4ac6fd..06c60c8c571 100644
--- a/extension/memory_allocator/malloc_memory_allocator.h
+++ b/extension/memory_allocator/malloc_memory_allocator.h
@@ -13,6 +13,7 @@
 #include
 #include
+#include
 #include
 
 namespace executorch {
@@ -51,20 +52,11 @@ class MallocMemoryAllocator : public executorch::runtime::MemoryAllocator {
       return nullptr;
     }
 
-    // The minimum alignment that malloc() is guaranteed to provide.
-    static constexpr size_t kMallocAlignment = alignof(std::max_align_t);
-    if (alignment > kMallocAlignment) {
-      // To get higher alignments, allocate extra and then align the returned
-      // pointer. This will waste an extra `alignment - 1` bytes every time, but
-      // this is the only portable way to get aligned memory from the heap.
-      const size_t extra = alignment - 1;
-      if ET_UNLIKELY (extra >= SIZE_MAX - size) {
-        ET_LOG(
-            Error, "Malloc size overflow: size=%zu + extra=%zu", size, extra);
-        return nullptr;
-      }
-      size += extra;
+    size_t adjusted_size = executorch::extension::utils::get_aligned_size(size, alignment);
+    if (adjusted_size == 0) {
+      return nullptr;
     }
+    size = adjusted_size;
     void* mem_ptr = std::malloc(size);
     if (!mem_ptr) {
       ET_LOG(Error, "Malloc failed to allocate %zu bytes", size);
diff --git a/extension/memory_allocator/memory_allocator_utils.h b/extension/memory_allocator/memory_allocator_utils.h
new file mode 100644
index 00000000000..3911f952e57
--- /dev/null
+++ b/extension/memory_allocator/memory_allocator_utils.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace executorch::extension::utils {
+
+// Util to get alignment adjusted allocation size
+inline size_t get_aligned_size(size_t size, size_t alignment) {
+  // The minimum alignment that malloc() is guaranteed to provide.
+  static constexpr size_t kMallocAlignment = alignof(std::max_align_t);
+  if (alignment > kMallocAlignment) {
+    // To get higher alignments, allocate extra and then align the returned
+    // pointer. This will waste an extra `alignment - 1` bytes every time, but
+    // this is the only portable way to get aligned memory from the heap.
+ const size_t extra = alignment - 1; + if ET_UNLIKELY (extra >= SIZE_MAX - size) { + ET_LOG( + Error, "Malloc size overflow: size=%zu + extra=%zu", size, extra); + return 0; + } + size += extra; + } + return size; +} + +} // namespace executorch::extension::utils diff --git a/extension/memory_allocator/targets.bzl b/extension/memory_allocator/targets.bzl index f51b084e4ee..f6fd6782cf9 100644 --- a/extension/memory_allocator/targets.bzl +++ b/extension/memory_allocator/targets.bzl @@ -9,6 +9,9 @@ def define_common_targets(): runtime.cxx_library( name = "malloc_memory_allocator", + headers = [ + "memory_allocator_utils.h", + ], exported_headers = [ "malloc_memory_allocator.h", ], @@ -23,6 +26,9 @@ def define_common_targets(): runtime.cxx_library( name = "cpu_caching_allocator", + headers = [ + "memory_allocator_utils.h", + ], srcs = [ "cpu_caching_malloc_allocator.cpp", ], diff --git a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp index df7f82d24a1..d9f07fea07b 100644 --- a/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp +++ b/extension/memory_allocator/test/cpu_caching_malloc_allocator_test.cpp @@ -241,6 +241,7 @@ TEST_F(CPUCachingAllocatorTest, ThreadSafety) { }; // Create threads + threads.reserve(num_threads); for (int i = 0; i < num_threads; ++i) { threads.emplace_back(thread_work); } @@ -284,7 +285,7 @@ TEST_F(CPUCachingAllocatorTest, SizeAlignmentAdjustment) { EXPECT_NE(p1, nullptr); EXPECT_ALIGNED(p1, 256); - auto p2 = allocator.allocate(100, 256); + allocator.allocate(100, 256); // Should not get cached pointer since size was adjusted during first // allocation allocator.reset(); From 230cd245b5bf1e67e7c02784bb69f8e2c8bcba76 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Fri, 5 Dec 2025 13:39:36 -0800 Subject: [PATCH 25/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. 
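Two related idioms run through memory_allocator_utils.h and its callers: rounding a size up to a multiple of a power-of-two alignment, and bumping a malloc'd pointer to the next aligned address inside the padded block (what MemoryAllocator::alignPointer does after get_aligned_size over-allocates). A self-contained sketch of both; note that the canonical round-up mask is ~(alignment - 1), whereas the SDPA hunks earlier in this series write & (-(kAlignment - 1)), which is a different mask:

#include <cstddef>
#include <cstdint>

// Round size up to the next multiple of a power-of-two alignment.
constexpr size_t round_up(size_t size, size_t alignment) {
  return (size + alignment - 1) & ~(alignment - 1);
}

// Advance a pointer to the next aligned address inside an over-allocated
// block; safe because the block was padded by alignment - 1 extra bytes.
inline void* align_pointer(void* ptr, size_t alignment) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  addr = (addr + alignment - 1) & ~(static_cast<std::uintptr_t>(alignment) - 1);
  return reinterpret_cast<void*>(addr);
}

static_assert(round_up(100, 256) == 256, "100 bytes fit in one 256-byte unit");
static_assert(round_up(33, 32) == 64, "33 bytes need two 32-byte units");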
Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- extension/memory_allocator/malloc_memory_allocator.h | 6 +----- extension/memory_allocator/targets.bzl | 8 ++------ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/extension/memory_allocator/malloc_memory_allocator.h b/extension/memory_allocator/malloc_memory_allocator.h index 06c60c8c571..c9aca8fb100 100644 --- a/extension/memory_allocator/malloc_memory_allocator.h +++ b/extension/memory_allocator/malloc_memory_allocator.h @@ -52,11 +52,7 @@ class MallocMemoryAllocator : public executorch::runtime::MemoryAllocator { return nullptr; } - size_t adjusted_size = executorch::extension::utils::get_aligned_size(size, alignment); - if (adjusted_size == 0) { - return nullptr; - } - size = adjusted_size; + size = executorch::extension::utils::get_aligned_size(size, alignment); void* mem_ptr = std::malloc(size); if (!mem_ptr) { ET_LOG(Error, "Malloc failed to allocate %zu bytes", size); diff --git a/extension/memory_allocator/targets.bzl b/extension/memory_allocator/targets.bzl index f6fd6782cf9..82976dfefe8 100644 --- a/extension/memory_allocator/targets.bzl +++ b/extension/memory_allocator/targets.bzl @@ -9,11 +9,9 @@ def define_common_targets(): runtime.cxx_library( name = "malloc_memory_allocator", - headers = [ - "memory_allocator_utils.h", - ], exported_headers = [ "malloc_memory_allocator.h", + "memory_allocator_utils.h", ], exported_deps = [ "//executorch/runtime/core:memory_allocator", @@ -26,14 +24,12 @@ def define_common_targets(): runtime.cxx_library( name = "cpu_caching_allocator", - headers = [ - "memory_allocator_utils.h", - ], srcs = [ "cpu_caching_malloc_allocator.cpp", ], exported_headers = [ "cpu_caching_malloc_allocator.h", + "memory_allocator_utils.h", ], exported_deps = [ "//executorch/runtime/core:memory_allocator", From 997b5e2542adf4d8ceecd15da42db3090bbb7d31 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Fri, 5 Dec 2025 14:02:18 -0800 Subject: [PATCH 26/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. 
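For context on the overflow guard inside get_aligned_size (PATCH 24 above): the padding of alignment - 1 bytes can wrap size_t, in which case malloc would be handed a tiny, wrong request. The predicate with its boundary spelled out (a sketch; the helper name is illustrative):

#include <cstddef>
#include <cstdint>

// Same predicate get_aligned_size uses: adding extra = alignment - 1 to a
// size this close to SIZE_MAX would wrap around to a small number.
constexpr bool padding_would_overflow(size_t size, size_t alignment) {
  const size_t extra = alignment - 1;
  return extra >= SIZE_MAX - size;
}

static_assert(!padding_would_overflow(1024, 256), "ordinary sizes are fine");
static_assert(padding_would_overflow(SIZE_MAX - 10, 256), "near SIZE_MAX wraps");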
Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/) [ghstack-poisoned] --- extension/memory_allocator/cpu_caching_malloc_allocator.cpp | 3 ++- extension/memory_allocator/memory_allocator_utils.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp index 084ca2eb15a..6855e924a16 100644 --- a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp +++ b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp @@ -19,7 +19,8 @@ void* CPUCachingAllocator::allocate(size_t size, size_t alignment) { return nullptr; } alignment = std::max(alignment, kCachingAllocatorDefaultAlignment); - size_t adjusted_size = executorch::extension::utils::get_aligned_size(size, alignment); + size_t adjusted_size = + executorch::extension::utils::get_aligned_size(size, alignment); if (adjusted_size == 0) { return nullptr; } diff --git a/extension/memory_allocator/memory_allocator_utils.h b/extension/memory_allocator/memory_allocator_utils.h index 3911f952e57..de9194358fb 100644 --- a/extension/memory_allocator/memory_allocator_utils.h +++ b/extension/memory_allocator/memory_allocator_utils.h @@ -27,8 +27,7 @@ inline size_t get_aligned_size(size_t size, size_t alignment) { // this is the only portable way to get aligned memory from the heap. const size_t extra = alignment - 1; if ET_UNLIKELY (extra >= SIZE_MAX - size) { - ET_LOG( - Error, "Malloc size overflow: size=%zu + extra=%zu", size, extra); + ET_LOG(Error, "Malloc size overflow: size=%zu + extra=%zu", size, extra); return 0; } size += extra; From 7590e9c1fdff08f5a06eb8f1669de04e424b3af6 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Fri, 5 Dec 2025 15:59:44 -0800 Subject: [PATCH 27/28] Update base for Update on "[Executorch][LLM] Use caching allocator for runner" We observed that on iOS it improves perf by 6% because SDPA op does temp allocations. No significant difference on android though. 
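The hunks below switch get_aligned_size to ExecuTorch's Result so that overflow becomes an explicit error state rather than a 0 sentinel. For reference, the general producer/consumer shape of a Result, as a sketch (checked_double is illustrative, not part of the patch):

#include <executorch/runtime/core/result.h>

#include <cstddef>
#include <cstdint>

using executorch::runtime::Error;
using executorch::runtime::Result;

// Illustrative Result-returning helper: fails instead of wrapping around.
Result<size_t> checked_double(size_t n) {
  if (n > SIZE_MAX / 2) {
    return Error::InvalidArgument; // Result converts from an Error
  }
  return n * 2;
}

size_t use(size_t n) {
  auto doubled = checked_double(n);
  if (!doubled.ok()) {
    return 0; // propagate failure however the caller prefers
  }
  return doubled.get();
}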
Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)

[ghstack-poisoned]
---
 .../memory_allocator/cpu_caching_malloc_allocator.cpp | 6 +++---
 extension/memory_allocator/malloc_memory_allocator.h  | 7 ++++++-
 extension/memory_allocator/memory_allocator_utils.h   | 7 +++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
index 6855e924a16..4f0ec4ffb33 100644
--- a/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
+++ b/extension/memory_allocator/cpu_caching_malloc_allocator.cpp
@@ -19,12 +19,12 @@ void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
     return nullptr;
   }
   alignment = std::max(alignment, kCachingAllocatorDefaultAlignment);
-  size_t adjusted_size =
+  auto adjusted_size_value =
       executorch::extension::utils::get_aligned_size(size, alignment);
-  if (adjusted_size == 0) {
+  if (!adjusted_size_value.ok()) {
     return nullptr;
   }
-  size = adjusted_size;
+  size = adjusted_size_value.get();
   std::lock_guard guard(mutex_);
   const auto& it = available_map_.find(size);
diff --git a/extension/memory_allocator/malloc_memory_allocator.h b/extension/memory_allocator/malloc_memory_allocator.h
index c9aca8fb100..1b924e36baa 100644
--- a/extension/memory_allocator/malloc_memory_allocator.h
+++ b/extension/memory_allocator/malloc_memory_allocator.h
@@ -52,7 +52,12 @@ class MallocMemoryAllocator : public executorch::runtime::MemoryAllocator {
       return nullptr;
     }
 
-    size = executorch::extension::utils::get_aligned_size(size, alignment);
+    auto adjusted_size_value =
+        executorch::extension::utils::get_aligned_size(size, alignment);
+    if (!adjusted_size_value.ok()) {
+      return nullptr;
+    }
+    size = adjusted_size_value.get();
     void* mem_ptr = std::malloc(size);
     if (!mem_ptr) {
       ET_LOG(Error, "Malloc failed to allocate %zu bytes", size);
diff --git a/extension/memory_allocator/memory_allocator_utils.h b/extension/memory_allocator/memory_allocator_utils.h
index de9194358fb..079537e60cd 100644
--- a/extension/memory_allocator/memory_allocator_utils.h
+++ b/extension/memory_allocator/memory_allocator_utils.h
@@ -13,12 +13,15 @@
 #include
 #include
+#include
 #include
 
+using executorch::runtime::Error;
+using executorch::runtime::Result;
 namespace executorch::extension::utils {
 
 // Util to get alignment adjusted allocation size
-inline size_t get_aligned_size(size_t size, size_t alignment) {
+inline Result get_aligned_size(size_t size, size_t alignment) {
   // The minimum alignment that malloc() is guaranteed to provide.
   static constexpr size_t kMallocAlignment = alignof(std::max_align_t);
   if (alignment > kMallocAlignment) {
@@ -28,7 +31,7 @@ inline size_t get_aligned_size(size_t size, size_t alignment) {
     const size_t extra = alignment - 1;
     if ET_UNLIKELY (extra >= SIZE_MAX - size) {
       ET_LOG(Error, "Malloc size overflow: size=%zu + extra=%zu", size, extra);
-      return 0;
+      return Result(Error::InvalidArgument);
     }
     size += extra;
   }

From 6ebb435654f4b71e229a640ea066bb1911ca0f3a Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Mon, 6 Apr 2026 08:18:09 -0700
Subject: [PATCH 28/28] Use caching allocator for runner (#15730)

Summary:
We observed that on iOS it improves perf by 6% because SDPA op does temp
allocations. No significant difference on android though.
ghstack-source-id: 328001114 exported-using-ghexport Reviewed By: navsud, derekdixu Differential Revision: D86120038 --- CMakeLists.txt | 2 ++ extension/llm/runner/CMakeLists.txt | 2 +- extension/llm/runner/llm_runner_helper.cpp | 21 +++++++++++++++++++-- extension/llm/runner/targets.bzl | 1 + 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c2251f933b..cefdebb2c2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1098,6 +1098,8 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) endif() if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator) + list(APPEND _executorch_extensions extension_memory_allocator) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) list(APPEND _executorch_extensions extension_llm_runner) endif() diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 655c2610ade..16264b274b4 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -40,7 +40,7 @@ add_subdirectory( ) set(runner_deps executorch_core extension_module extension_tensor - extension_llm_sampler tokenizers::tokenizers + extension_llm_sampler extension_memory_allocator tokenizers::tokenizers ) # depend on arange_utils diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp index 6038353fdb5..933e7cf9ae5 100644 --- a/extension/llm/runner/llm_runner_helper.cpp +++ b/extension/llm/runner/llm_runner_helper.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -223,12 +224,28 @@ std::unique_ptr create_text_llm_runner( // Create the Module std::unique_ptr module; + uint32_t max_cached_memory_size_bytes_ = 1024 * 1024 * 10; // 10MB if (data_files.size() > 0) { module = std::make_unique( - model_path, data_files, load_mode, std::move(event_tracer)); + model_path, + data_files, + load_mode, + std::move(event_tracer), + nullptr, // memory allocator + std::make_unique< + executorch::extension::CPUCachingAllocator>( // temp memory + // allocator + max_cached_memory_size_bytes_)); } else { module = std::make_unique( - model_path, load_mode, std::move(event_tracer)); + model_path, + load_mode, + std::move(event_tracer), // event tracer + nullptr, // memory allocator + std::make_unique< + executorch::extension::CPUCachingAllocator>( // temp memory + // allocator + max_cached_memory_size_bytes_)); } // Get metadata from Module diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 2c9000d0137..0d4ed99308d 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -132,6 +132,7 @@ def define_common_targets(): ":text_prefiller" + aten_suffix, ":text_token_generator" + aten_suffix, "//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix, + "//executorch/extension/memory_allocator:cpu_caching_allocator", "//pytorch/tokenizers:hf_tokenizer", "//pytorch/tokenizers:llama2c_tokenizer", "//pytorch/tokenizers:sentencepiece",