From c19770103c451915e46e61f8dc3d519c516903e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= <mans.nilsson@arm.com>
Date: Fri, 9 Jan 2026 17:47:26 +0100
Subject: [PATCH] PoC Cortex_m backend: Add support for scratch buffers

Use exir.memory.alloc for the CMSIS-NN scratch buffers. This is ideal
because the alloc node has a TensorSpec and is memory planned, yet it
adds no additional operator overhead.
Use the CMSIS-NN pybind wrapper to compute the correct buffer size.

Change-Id: Ia7ec8eda87833888a0639b480e531fd17818298a
---
 backends/cortex_m/ops/op_quantized_conv2d.cpp |  21 +++-
 .../ops/op_quantized_depthwise_conv2d.cpp     |  23 +++-
 backends/cortex_m/ops/operators.py            |  10 +-
 backends/cortex_m/ops/operators.yaml          |   7 +-
 .../passes/convert_to_cortex_m_pass.py        | 107 +++++++++++++++++-
 .../executor_runner/arm_executor_runner.cpp   |  18 +--
 6 files changed, 171 insertions(+), 15 deletions(-)
diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp
index ad14af98865..d2106e4022a 100644
--- a/backends/cortex_m/ops/op_quantized_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2025 Arm Limited and/or its affiliates.
+ * Copyright 2025-2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -105,6 +105,7 @@ bool validate_conv2d_arguments(
 Tensor& quantized_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
+    const Tensor& scratch,
     const Tensor& weight,
     const torch::executor::optional<Tensor>& bias,
     const IntArrayRef stride,
@@ -190,6 +191,22 @@ Tensor& quantized_conv2d_out(
 
   const size_t buffer_bytes = static_cast<size_t>(
       arm_convolve_s8_get_buffer_size(&input_dims, &filter_dims));
+
+#if 1 // PoC: scratch comes from the memory-planned tensor, not allocate_temp
+  if (scratch.nbytes() != buffer_bytes) {
+    ET_LOG(
+        Error,
+        "quantized_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+        static_cast<int>(scratch.nbytes()),
+        static_cast<int>(buffer_bytes));
+    // Report the failure through the kernel context, not just the log.
+    context.fail(::executorch::runtime::Error::InvalidArgument);
+    return out;
+  }
+  cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
+  cmsis_context.size = buffer_bytes;
+#else
+
   if (buffer_bytes > 0) {
     auto buffer_or_error =
         context.allocate_temp(buffer_bytes, alignof(int16_t));
@@ -207,7 +224,7 @@ Tensor& quantized_conv2d_out(
       cmsis_context.size = buffer_bytes;
     }
   }
-
+#endif
   const arm_cmsis_nn_status status = arm_convolve_wrapper_s8(
       &cmsis_context,
       &conv_params,
diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
index 9cf0625ec7b..dd2168bcf76 100644
--- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -143,6 +144,7 @@ bool validate_depthwise_conv2d_arguments(
 Tensor& quantized_depthwise_conv2d_out(
     KernelRuntimeContext& context,
     const Tensor& input,
+    const Tensor& scratch,
     const Tensor& weight,
     const torch::executor::optional<Tensor>& bias,
     const IntArrayRef stride,
@@ -237,6 +239,25 @@ Tensor& quantized_depthwise_conv2d_out(
     return out;
   }
 
+#if 1 // PoC: scratch comes from the memory-planned tensor, not allocate_temp
+  if (scratch.nbytes() < static_cast<size_t>(buffer_bytes)) {
+    ET_LOG(
+        Error,
+        "quantized_dw_conv2d_out: scratch buffer not big enough - actual: (%d) needed: (%d)",
+        static_cast<int>(scratch.nbytes()),
+        static_cast<int>(buffer_bytes));
+    context.fail(::executorch::runtime::Error::InvalidArgument);
+    return out;
+  }
+  ET_LOG(
+      Debug,
+      "quantized_dw_conv2d_out: scratch buffer size - actual: (%d) needed: (%d)",
+      static_cast<int>(scratch.nbytes()),
+      static_cast<int>(buffer_bytes));
+  cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
+  cmsis_context.size = buffer_bytes;
+#else
+
   auto buffer_or_error = context.allocate_temp(
       static_cast<size_t>(buffer_bytes), alignof(int16_t));
   if (!buffer_or_error.ok()) {
@@ -250,7 +271,7 @@ Tensor& quantized_depthwise_conv2d_out(
   }
   cmsis_context.buf = buffer_or_error.get();
   cmsis_context.size = buffer_bytes;
-
+#endif
   const arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8(
       &cmsis_context,
       &dw_conv_params,
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index c62cac6a2be..24db11f4c9d 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -494,6 +494,7 @@ def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor:
 lib.define(
     "quantized_conv2d("
     "Tensor input, "
+    "Tensor scratch, "
     "Tensor weight, "
     "Tensor? bias, "
     "int[] stride, "
@@ -512,6 +513,7 @@ def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor:
 lib.define(
     "quantized_conv2d.out("
     "Tensor input, "
+    "Tensor scratch, "
     "Tensor weight, "
     "Tensor? bias, "
     "int[] stride, "
@@ -588,6 +590,7 @@ def _compute_depthwise_conv2d_output_shape(
 @register_fake("cortex_m::quantized_conv2d")
 def quantized_conv2d_meta(
     input: torch.Tensor,
+    scratch: torch.Tensor,
     weight: torch.Tensor,
     bias: torch.Tensor | None,
     stride: Sequence[int],
@@ -617,6 +620,7 @@ def quantized_conv2d_meta(
 @impl(lib, "quantized_conv2d", "CompositeExplicitAutograd")
 def quantized_conv2d_impl(
     input: torch.Tensor,
+    scratch: torch.Tensor,
     weight: torch.Tensor,
     bias: torch.Tensor | None,
     stride: Sequence[int],
@@ -683,6 +687,7 @@ def quantized_conv2d_impl(
 lib.define(
     "quantized_depthwise_conv2d("
     "Tensor input, "
+    "Tensor scratch, "
     "Tensor weight, "
     "Tensor? bias, "
     "int[] stride, "
@@ -702,6 +707,7 @@ def quantized_conv2d_impl(
 lib.define(
     "quantized_depthwise_conv2d.out("
     "Tensor input, "
+    "Tensor scratch, "
     "Tensor weight, "
     "Tensor? bias, "
     "int[] stride, "
@@ -722,6 +728,7 @@ def quantized_conv2d_impl(
 @register_fake("cortex_m::quantized_depthwise_conv2d")
 def quantized_depthwise_conv2d_meta(
     input: torch.Tensor,
+    scratch: torch.Tensor,
     weight: torch.Tensor,
     bias: torch.Tensor | None,
     stride: Sequence[int],
@@ -752,6 +759,7 @@ def quantized_depthwise_conv2d_meta(
 @impl(lib, "quantized_depthwise_conv2d", "CompositeExplicitAutograd")
 def quantized_depthwise_conv2d_impl(
     input: torch.Tensor,
+    scratch: torch.Tensor,
     weight: torch.Tensor,
     bias: torch.Tensor | None,
     stride: Sequence[int],
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
index d1222c3c0b0..c9d6010eb8c 100644
--- a/backends/cortex_m/ops/operators.yaml
+++ b/backends/cortex_m/ops/operators.yaml
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -59,13 +59,14 @@
     - arg_meta: null
       kernel_name: cortex_m::transpose_out
 
-- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
+- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor scratch, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
       kernel_name: cortex_m::quantized_conv2d_out
 
-- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor scratch, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index bed82c757bc..c23244c657e 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -1,15 +1,21 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 
+from typing import Sequence
+
 import executorch.backends.cortex_m.ops.operators  # noqa
+import executorch.exir as exir
 
 import torch
 import torch.fx
+
+from cmsisnn_sizes import convolve_wrapper_s8_buffer_size_mve
+
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
 
@@ -20,10 +26,80 @@
 
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.tensor import TensorSpec
 from torch.export.graph_signature import InputKind
 from torch.fx.passes.infra.pass_manager import PassResult
 
 
+def shape_from_node(n: torch.fx.Node) -> list[int]:
+    """Return the shape of ``n`` from ``meta["spec"]``, else ``meta["val"]``."""
+    spec_shape = getattr(n.meta.get("spec", None), "shape", None)
+    if spec_shape is not None:
+        return [int(dim) for dim in spec_shape]
+
+    fake = n.meta.get("val", None)
+    if fake is not None:
+        first = fake[0] if isinstance(fake, (tuple, list)) else fake
+        return [int(dim) for dim in first.shape]
+
+    raise KeyError(f"No shape meta on node {n.format_node()} (need spec or val)")
+
+
+def cmsisnn_conv_s8_required_bytes_mve(
+    *,
+    x: torch.fx.Node,
+    conv_node: torch.fx.Node,
+    weight_shape: Sequence[int],
+    stride: Sequence[int],
+    padding: Sequence[int],
+    dilation: Sequence[int],
+    input_zero_point: int,
+    output_zero_point: int,
+    output_qmin: int,
+    output_qmax: int,
+) -> int:
+    """Return the CMSIS-NN (MVE) scratch size in bytes for an int8 conv2d.
+
+    ``x`` and ``conv_node`` supply the NCHW input and output shapes; they are
+    reordered into the NHWC lists passed to the buffer-size query.
+    """
+    # Input is NCHW (PyTorch); CMSIS-NN wants NHWC dims.
+    N, C_in, H, W = shape_from_node(x)
+
+    # Weight is (C_out, C_in/groups, kH, kW) in PyTorch
+    C_out, _, kH, kW = map(int, weight_shape)
+
+    # Output is NCHW; convert to NHWC dims.
+    N2, C_out2, H_out, W_out = shape_from_node(conv_node)
+
+    input_nhwc = [N, H, W, C_in]
+    filter_nhwc = [C_out, kH, kW, C_in]  # n=out_ch, h=kH, w=kW, c=in_ch
+    output_nhwc = [N2, H_out, W_out, C_out2]
+
+    stride_hw = [int(stride[0]), int(stride[1])]
+    padding_hw = [int(padding[0]), int(padding[1])]
+    dilation_hw = [int(dilation[0]), int(dilation[1])]
+
+    # CMSIS-NN: input_offset is -zero_point, output_offset is +zero_point.
+    input_offset = -int(input_zero_point)
+    output_offset = int(output_zero_point)
+
+    return int(
+        convolve_wrapper_s8_buffer_size_mve(
+            input_nhwc=input_nhwc,
+            filter_nhwc=filter_nhwc,
+            output_nhwc=output_nhwc,
+            padding_hw=padding_hw,
+            stride_hw=stride_hw,
+            dilation_hw=dilation_hw,
+            input_offset=input_offset,
+            output_offset=output_offset,
+            activation_min=int(output_qmin),
+            activation_max=int(output_qmax),
+        )
+    )
+
+
 class ConvertToCortexMPass(XNNPACKPass):
     """
     Cortex-M backend pass for replacing supported quantized kernels with Cortex-M
@@ -233,6 +309,31 @@ def _get_convolution_replacement(self, node) -> int:
                 torch.tensor(quantized_shifts, dtype=torch.int32),
             )
 
+        # Size the CMSIS-NN scratch buffer ahead of time so it is memory planned.
+        fake_size = 2000  # TODO add DW conv get buffer size function
+        if use_depthwise_conv:
+            required_bytes = fake_size
+        else:
+            weight_shape = get_first_fake_tensor(weight).shape
+            required_bytes = cmsisnn_conv_s8_required_bytes_mve(
+                x=x,
+                conv_node=node,
+                weight_shape=weight_shape,
+                stride=stride,
+                padding=padding,
+                dilation=dilation,
+                input_zero_point=input_zero_point,
+                output_zero_point=output_zero_point,
+                output_qmin=output_qmin,
+                output_qmax=output_qmax,
+            )
+
+        graph = self.exported_program.graph_module.graph
+        with graph.inserting_before(node):
+            scratch = graph.call_function(
+                exir.memory.alloc, args=(((required_bytes,), torch.uint8),), kwargs={}
+            )
+
         if use_depthwise_conv:
             # Compute depth_multiplier for depthwise convolution
             # For depthwise: output_channels = input_channels * depth_multiplier
@@ -246,6 +347,7 @@ def _get_convolution_replacement(self, node) -> int:
 
             new_args = (
                 x,
+                scratch,
                 weight_nhwc,
                 bias,
                 stride,
@@ -264,6 +366,7 @@ def _get_convolution_replacement(self, node) -> int:
             # Use regular convolution operator
             new_args = (
                 x,
+                scratch,
                 weight_nhwc,
                 bias,
                 stride,
@@ -305,6 +408,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                     kwargs={},
                 )
 
+                cortex_m_op.meta.update(node.meta)  # preserve shape for get buffer size
+
                 node.replace_all_uses_with(cortex_m_op)
                 graph_module.graph.erase_node(node)
 
diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp
index 17d7a0c8898..9b799b8fe10 100644
--- a/examples/arm/executor_runner/arm_executor_runner.cpp
+++ b/examples/arm/executor_runner/arm_executor_runner.cpp
@@ -1,6 +1,6 @@
 /* Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
- * Copyright 2023-2025 Arm Limited and/or its affiliates.
+ * Copyright 2023-2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -256,11 +256,12 @@ const int num_inferences = 1;
  * ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE and
  * ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE
  */
-const size_t temp_allocation_pool_size =
-    ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE;
-unsigned char __attribute__((
-    section(".bss.tensor_arena"),
-    aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
+constexpr size_t temp_allocation_pool_size = 0;
+//    ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE;
+unsigned char* temp_allocation_pool = nullptr;
+// unsigned char __attribute__((
+//     section(".bss.tensor_arena"),
+//     aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
 #if defined(ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE)
 extern "C" {
 size_t ethosu_fast_scratch_size =
@@ -637,7 +638,10 @@ void runner_init(
   size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
   ctx.planned_spans.reserve(num_memory_planned_buffers);
   size_t planned_buffer_membase = ctx.method_allocator->used_size();
-
+  ET_LOG(
+      Info,
+      "Method meta, has %zu instructions",
+      method_meta->num_instructions());
   for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
     size_t buffer_size =
         static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());