From c19770103c451915e46e61f8dc3d519c516903e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Fri, 9 Jan 2026 17:47:26 +0100 Subject: [PATCH] PoC Cortex_m backend: Add support for scratch buffers Use exir.memory.alloc for CMSIS-NN scratch buffers, which is ideal since it has a TensorSpec and gets memory planned but creates no additional operator overhead. Use CMSIS-NN pybind wrapper to get correct buffer size. Change-Id: Ia7ec8eda87833888a0639b480e531fd17818298a --- backends/cortex_m/ops/op_quantized_conv2d.cpp | 21 +++- .../ops/op_quantized_depthwise_conv2d.cpp | 23 +++- backends/cortex_m/ops/operators.py | 10 +- backends/cortex_m/ops/operators.yaml | 7 +- .../passes/convert_to_cortex_m_pass.py | 107 +++++++++++++++++- .../executor_runner/arm_executor_runner.cpp | 18 +-- 6 files changed, 171 insertions(+), 15 deletions(-) diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp index ad14af98865..d2106e4022a 100644 --- a/backends/cortex_m/ops/op_quantized_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2025 Arm Limited and/or its affiliates. + * Copyright 2025-2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
@@ -105,6 +105,7 @@ bool validate_conv2d_arguments( Tensor& quantized_conv2d_out( KernelRuntimeContext& context, const Tensor& input, + const Tensor& scratch, const Tensor& weight, const torch::executor::optional& bias, const IntArrayRef stride, @@ -190,6 +191,22 @@ Tensor& quantized_conv2d_out( const size_t buffer_bytes = static_cast( arm_convolve_s8_get_buffer_size(&input_dims, &filter_dims)); + +#if 1 + cmsis_context.buf = scratch.mutable_data_ptr(); + cmsis_context.size = buffer_bytes; + + if (scratch.nbytes() != buffer_bytes) { + ET_LOG( + Error, + "quantized_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + static_cast(buffer_bytes)); + return out; + } + +#else + if (buffer_bytes > 0) { auto buffer_or_error = context.allocate_temp(buffer_bytes, alignof(int16_t)); @@ -207,7 +224,7 @@ Tensor& quantized_conv2d_out( cmsis_context.size = buffer_bytes; } } - +#endif const arm_cmsis_nn_status status = arm_convolve_wrapper_s8( &cmsis_context, &conv_params, diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp index 9cf0625ec7b..dd2168bcf76 100644 --- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp @@ -1,6 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
@@ -143,6 +144,7 @@ bool validate_depthwise_conv2d_arguments( Tensor& quantized_depthwise_conv2d_out( KernelRuntimeContext& context, const Tensor& input, + const Tensor& scratch, const Tensor& weight, const torch::executor::optional& bias, const IntArrayRef stride, @@ -237,6 +239,25 @@ Tensor& quantized_depthwise_conv2d_out( return out; } +#if 1 + cmsis_context.buf = scratch.mutable_data_ptr(); + cmsis_context.size = buffer_bytes; + ET_LOG( + Info, + "quantized_dw_conv2d_out: scratch buffer size - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + static_cast(buffer_bytes)); + + if (scratch.nbytes() < buffer_bytes) { + ET_LOG( + Error, + "quantized_dw_conv2d_out: scratch buffer not big enough - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + static_cast(buffer_bytes)); + return out; + } +#else + auto buffer_or_error = context.allocate_temp( static_cast(buffer_bytes), alignof(int16_t)); if (!buffer_or_error.ok()) { @@ -250,7 +271,7 @@ Tensor& quantized_depthwise_conv2d_out( } cmsis_context.buf = buffer_or_error.get(); cmsis_context.size = buffer_bytes; - +#endif const arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8( &cmsis_context, &dw_conv_params, diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index c62cac6a2be..24db11f4c9d 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -494,6 +494,7 @@ def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor: lib.define( "quantized_conv2d(" "Tensor input, " + "Tensor scratch, " "Tensor weight, " "Tensor? 
bias, " "int[] stride, " @@ -512,6 +513,7 @@ def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor: lib.define( "quantized_conv2d.out(" "Tensor input, " + "Tensor scratch, " "Tensor weight, " "Tensor? bias, " "int[] stride, " @@ -588,6 +590,7 @@ def _compute_depthwise_conv2d_output_shape( @register_fake("cortex_m::quantized_conv2d") def quantized_conv2d_meta( input: torch.Tensor, + scratch: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None, stride: Sequence[int], @@ -617,6 +620,7 @@ def quantized_conv2d_meta( @impl(lib, "quantized_conv2d", "CompositeExplicitAutograd") def quantized_conv2d_impl( input: torch.Tensor, + scratch: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None, stride: Sequence[int], @@ -683,6 +687,7 @@ def quantized_conv2d_impl( lib.define( "quantized_depthwise_conv2d(" "Tensor input, " + "Tensor scratch, " "Tensor weight, " "Tensor? bias, " "int[] stride, " @@ -702,6 +707,7 @@ def quantized_conv2d_impl( lib.define( "quantized_depthwise_conv2d.out(" "Tensor input, " + "Tensor scratch, " "Tensor weight, " "Tensor? bias, " "int[] stride, " @@ -722,6 +728,7 @@ def quantized_conv2d_impl( @register_fake("cortex_m::quantized_depthwise_conv2d") def quantized_depthwise_conv2d_meta( input: torch.Tensor, + scratch: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None, stride: Sequence[int], @@ -752,6 +759,7 @@ def quantized_depthwise_conv2d_meta( @impl(lib, "quantized_depthwise_conv2d", "CompositeExplicitAutograd") def quantized_depthwise_conv2d_impl( input: torch.Tensor, + scratch: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None, stride: Sequence[int], diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml index d1222c3c0b0..c9d6010eb8c 100644 --- a/backends/cortex_m/ops/operators.yaml +++ b/backends/cortex_m/ops/operators.yaml @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2025 Arm Limited and/or its affiliates. 
+# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -59,13 +59,14 @@ - arg_meta: null kernel_name: cortex_m::transpose_out -- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) +- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor scratch, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: cortex_m::quantized_conv2d_out -- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) + +- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor scratch, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) 
variants: function kernels: - arg_meta: null diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index bed82c757bc..c23244c657e 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -1,15 +1,21 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Sequence + import executorch.backends.cortex_m.ops.operators # noqa +import executorch.exir as exir import torch import torch.fx + +from cmsisnn_sizes import convolve_wrapper_s8_buffer_size_mve + from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot @@ -20,10 +26,80 @@ from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.tensor import TensorSpec from torch.export.graph_signature import InputKind from torch.fx.passes.infra.pass_manager import PassResult +def shape_from_node(n: torch.fx.Node) -> list[int]: + spec = n.meta.get("spec", None) + if spec is not None and getattr(spec, "shape", None) is not None: + return [int(s) for s in spec.shape] + + v = n.meta.get("val", None) + if v is not None: + if isinstance(v, (tuple, list)): + v = v[0] + return [int(s) for s in v.shape] + + raise KeyError(f"No shape meta on node {n.format_node()} (need spec or val)") + + +def cmsisnn_conv_s8_required_bytes_mve( + *, + x: torch.fx.Node, + conv_node: torch.fx.Node, + weight_shape: Sequence[int], + stride: Sequence[int], + padding: Sequence[int], + dilation: Sequence[int], + input_zero_point: int, + 
output_zero_point: int, + output_qmin: int, + output_qmax: int, +) -> int: + # Input is NCHW (PyTorch); CMSIS-NN wants NHWC dims. + N, C_in, H, W = shape_from_node(x) + + # Weight is (C_out, C_in/groups, kH, kW) in PyTorch + C_out, _, kH, kW = map(int, weight_shape) + + # Output is NCHW; convert to NHWC dims. + N2, C_out2, H_out, W_out = shape_from_node(conv_node) + + input_nhwc = [N, H, W, C_in] + filter_nhwc = [ + C_out, + kH, + kW, + C_in, + ] # CMSIS-NN convention: n=out_ch, h=kH, w=kW, c=in_ch + output_nhwc = [N2, H_out, W_out, C_out2] + + stride_hw = [int(stride[0]), int(stride[1])] + padding_hw = [int(padding[0]), int(padding[1])] + dilation_hw = [int(dilation[0]), int(dilation[1])] + + # CMSIS-NN conv_params offsets are "negative of zero point" + input_offset = -int(input_zero_point) + output_offset = -int(output_zero_point) + + return int( + convolve_wrapper_s8_buffer_size_mve( + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + output_nhwc=output_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + input_offset=input_offset, + output_offset=output_offset, + activation_min=int(output_qmin), + activation_max=int(output_qmax), + ) + ) + + class ConvertToCortexMPass(XNNPACKPass): """ Cortex-M backend pass for replacing supported quantized kernels with Cortex-M @@ -233,6 +309,31 @@ def _get_convolution_replacement(self, node) -> int: torch.tensor(quantized_shifts, dtype=torch.int32), ) + fake_size = 2000 # TODO add DW conv get buffer size function + if use_depthwise_conv: + required_bytes = fake_size + else: + weight_shape = get_first_fake_tensor(weight).shape + required_bytes = cmsisnn_conv_s8_required_bytes_mve( + x=x, + conv_node=node, + weight_shape=weight_shape, + stride=stride, + padding=padding, + dilation=dilation, + input_zero_point=input_zero_point, + output_zero_point=output_zero_point, + output_qmin=output_qmin, + output_qmax=output_qmax, + ) + print("required_bytes = ", required_bytes) + + graph = 
self.exported_program.graph_module.graph + with graph.inserting_before(node): + scratch = graph.call_function( + exir.memory.alloc, args=(((required_bytes,), torch.uint8),), kwargs={} + ) + if use_depthwise_conv: # Compute depth_multiplier for depthwise convolution # For depthwise: output_channels = input_channels * depth_multiplier @@ -246,6 +347,7 @@ def _get_convolution_replacement(self, node) -> int: new_args = ( x, + scratch, weight_nhwc, bias, stride, @@ -264,6 +366,7 @@ def _get_convolution_replacement(self, node) -> int: # Use regular convolution operator new_args = ( x, + scratch, weight_nhwc, bias, stride, @@ -305,6 +408,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: kwargs={}, ) + cortex_m_op.meta.update(node.meta) # preserve shape for get buffer size + node.replace_all_uses_with(cortex_m_op) graph_module.graph.erase_node(node) diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 17d7a0c8898..9b799b8fe10 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -1,6 +1,6 @@ /* Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. - * Copyright 2023-2025 Arm Limited and/or its affiliates. + * Copyright 2023-2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
@@ -256,11 +256,12 @@ const int num_inferences = 1; * ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE and * ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE */ -const size_t temp_allocation_pool_size = - ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE; -unsigned char __attribute__(( - section(".bss.tensor_arena"), - aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; +constexpr size_t temp_allocation_pool_size = 0; +// ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE; +unsigned char* temp_allocation_pool = nullptr; +// unsigned char __attribute__(( +// section(".bss.tensor_arena"), +// aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; #if defined(ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) extern "C" { size_t ethosu_fast_scratch_size = @@ -637,7 +638,10 @@ void runner_init( size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ctx.planned_spans.reserve(num_memory_planned_buffers); size_t planned_buffer_membase = ctx.method_allocator->used_size(); - + ET_LOG( + Info, + "Method meta, has %zu instructions", + method_meta->num_instructions()); for (size_t id = 0; id < num_memory_planned_buffers; ++id) { size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get());