Commit a9d8230
Arm backend: Support a8w4 for convolution and linear. (#16577)
Int4 is added as a TosaSpecialDtype. Since torch.dtype has no int4, the dtype is carried in an int8 tensor quantized with qmin=-7, qmax=7. That range is then detected when folding the qparams, where the tosa_special_dtype meta is added. Some additional changes are needed to make sure the meta survives all the way to TOSA. Tests are added for conv2d, conv3d, depthwise conv, and linear. One conv2d test case was actually a depthwise conv, so it was modified.

Signed-off-by: Erik Lundell <erik.lundell@arm.com>
1 parent cccf977 commit a9d8230

12 files changed

Lines changed: 365 additions & 34 deletions
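
As a usage sketch (not part of the diff): the a8w4 config plugs into the existing symmetric-config machinery by narrowing the weight range to [-7, 7] while activations stay int8. The snippet below assumes a TOSAQuantizer built from a TOSA spec string with the int4 extension; the quantizer construction and spec string are illustrative assumptions, while get_symmetric_a8w4_quantization_config is the function added by this commit.

from executorch.backends.arm.quantizer.arm_quantizer import (
    TOSAQuantizer,
    get_symmetric_a8w4_quantization_config,
)
from executorch.backends.arm.tosa.specification import TosaSpecification

# Assumed spec string; the Ethos-U compile spec in this commit appends "+int4"
# to its base TOSA version ("TOSA-1.0+INT+int16+int4").
tosa_spec = TosaSpecification.create_from_string("TOSA-1.0+INT+int4")

# Quantizer construction is illustrative; set_global applies the a8w4 config,
# i.e. int8 activations and weights quantized with qmin=-7 / qmax=7 in an int8
# container, which the fold pass later tags as TosaSpecialDtype.INT4.
quantizer = TOSAQuantizer(tosa_spec)
quantizer.set_global(get_symmetric_a8w4_quantization_config(is_per_channel=True))

The new conv2d tests below apply the same config through pipeline.quantizer.set_global(...).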

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 14 additions & 1 deletion
@@ -1,4 +1,4 @@
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
+# Copyright 2024-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -24,6 +24,7 @@
 from executorch.backends.arm._passes.remove_noop_pass import RemoveNoopPass
 from executorch.backends.arm.common.annotation_meta import ArmAnnotationInfo
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir import ExportedProgram

 from executorch.exir.dialects._ops import ops as exir_ops
@@ -32,6 +33,13 @@
 from torch.fx import GraphModule, Node


+def _get_special_dtype(qspec: QuantArgs) -> TosaSpecialDtype | None:
+    if qspec.dtype == torch.int8:
+        if qspec.qmax == 7 and qspec.qmin == -7:
+            return TosaSpecialDtype.INT4
+    return None
+
+
 def get_input_qparams(node: Node) -> dict[int, QuantArgs]:
     """
     Get the input quantization parameters from a node, set by the 'FoldAndAnnotateQParamsPass'.
@@ -157,6 +165,11 @@ def fold_and_annotate_arg(
                 node.replace_input_with(n, cast(Node, n.args[0]))
                 if len(n.users) == 0:
                     graph_module.graph.erase_node(n)
+                special_dtype = _get_special_dtype(input_qparams)
+                if special_dtype:
+                    node.all_input_nodes[i].meta[
+                        TosaSpecialDtype.meta_key()
+                    ] = special_dtype

     def _handle_control_flow_node(self, node: Node, graph_module: GraphModule):
         """Fold outmost quant nodes inside submodule.

backends/arm/_passes/fuse_constant_ops_pass.py

Lines changed: 21 additions & 1 deletion
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -18,6 +18,7 @@
 from executorch.backends.arm._passes.fuse_equal_placeholders_pass import (
     FuseEqualPlaceholdersPass,
 )
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.backends.transforms.utils import (
     create_constant_placeholder,
     delete_constant_placeholder,
@@ -52,6 +53,23 @@ def __init__(self, exported_program: ExportedProgram, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.exported_program = exported_program

+    def _propagate_special_dtype(self, from_nodes, to_node, data):
+        """Propagate special dtype meta if it exists."""
+        special_dtypes = set()
+        for input_node in from_nodes:
+            special_type = input_node.meta.get(TosaSpecialDtype.meta_key(), None)
+            if special_type:
+                special_dtypes.add(special_type)
+        if len(special_dtypes) > 1:
+            logger.warning(
+                "Propagating mixed special dtypes is not implemented, skipping."
+            )
+        elif len(special_dtypes) == 1:
+            special_dtype = list(special_dtypes)[0]
+            # Make sure data is still within special dtype range.
+            if data.abs().max() <= special_dtype.max():
+                to_node.meta[TosaSpecialDtype.meta_key()] = special_dtype
+
     def _fuse_nodes(self, node) -> bool:
         """
         Takes a node with only parameter inputs and replaces it with one constant tensor node with
@@ -105,6 +123,8 @@ def resolve_arg(arg):
             persistent_buffer=persistent_buffer,
         )

+        self._propagate_special_dtype(input_nodes, const_node, data)
+
         node.replace_all_uses_with(const_node)

         return True

backends/arm/_passes/fuse_equal_placeholders_pass.py

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -53,11 +53,11 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:

             # ensure we don't merge any special case int48_t tensors with int32_t tensors
             # since int48_t tensors needs to be instantiated separately.
-            is_int48 = node.meta.get(TosaSpecialDtype.meta_key(), None)
+            is_special_dtype = node.meta.get(TosaSpecialDtype.meta_key(), None)
             t_cpu = tensor.detach().cpu().contiguous()
             data_bytes = t_cpu.numpy().tobytes()
             key = (
-                is_int48,
+                is_special_dtype,
                 str(t_cpu.dtype),
                 tuple(t_cpu.shape),
                 hashlib.sha1(data_bytes, usedforsecurity=False).hexdigest(),

backends/arm/ethosu/compile_spec.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -73,7 +73,7 @@ def __init__(
             compiler_flags.append(f"--memory-mode={memory_mode}")

         # Set TOSA version.
-        base_tosa_version = "TOSA-1.0+INT+int16"
+        base_tosa_version = "TOSA-1.0+INT+int16+int4"
         if "u55" in target_lower:
             # Add the Ethos-U55 extension marker
             base_tosa_version += "+u55"

backends/arm/operators/ops_identity.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -53,6 +53,8 @@ def define_node(
             supported_dtypes += [ts.DType.FP32]
         if self.tosa_spec.support_extension("int16"):
             supported_dtypes += [ts.DType.INT48]
+        if self.tosa_spec.support_extension("int4"):
+            supported_dtypes += [ts.DType.INT4]
         validate_valid_dtype(
             self.target,
             [inputs[0], output],

backends/arm/process_node.py

Lines changed: 3 additions & 14 deletions
@@ -1,4 +1,4 @@
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
+# Copyright 2024-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -12,7 +12,7 @@
 import torch.fx
 import tosa_serializer as ts
 from executorch.backends.arm.operators.node_visitor import NodeVisitor
-from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype
+from executorch.backends.arm.tosa.mapping import TosaArg
 from executorch.backends.arm.tosa.specification import TosaSpecification
 from executorch.backends.arm.tosa.utils import tosa_shape
 from torch._export.utils import (
@@ -116,21 +116,10 @@ def process_inputs_to_parameters(
     )
     parameter_values = parameter_data.detach().numpy()

-    if tosa_arg.dtype == torch.float32:
-        if not tosa_spec.support_float():
-            raise ValueError(f"{tosa_spec} doesn't support float operations")
-
-    # Handle special case for INT48 tensors
-    special_type = node.meta.get(TosaSpecialDtype.meta_key(), None)
-    if isinstance(special_type, TosaSpecialDtype):
-        tosa_dtype = special_type.get_tosa_dtype()
-    else:
-        tosa_dtype = tosa_arg.dtype
-
     parameter_values = np.transpose(parameter_values, tosa_arg.dim_order)

     tosa_graph.addConst(
-        parameter_values.shape, tosa_dtype, parameter_values, name=tosa_arg.name
+        parameter_values.shape, tosa_arg.dtype, parameter_values, name=tosa_arg.name
     )

backends/arm/quantizer/arm_quantizer.py

Lines changed: 8 additions & 0 deletions
@@ -180,6 +180,14 @@ def get_symmetric_quantization_config(
     return quantization_config


+def get_symmetric_a8w4_quantization_config(
+    is_per_channel: bool = True, is_qat: bool = True, is_dynamic: bool = False
+):
+    return get_symmetric_quantization_config(
+        is_per_channel, is_qat, is_dynamic, weight_qmin=-7, weight_qmax=7
+    )
+
+
 @functools.lru_cache
 def get_symmetric_a16w8_quantization_config(
     is_per_channel: bool = True,

backends/arm/test/ops/test_conv2d.py

Lines changed: 77 additions & 4 deletions
@@ -1,4 +1,4 @@
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
+# Copyright 2024-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -7,6 +7,9 @@
 from typing import List, Tuple, Union

 import torch
+from executorch.backends.arm.quantizer.arm_quantizer import (
+    get_symmetric_a8w4_quantization_config,
+)
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
     EthosU55PipelineINT,
@@ -17,6 +20,7 @@
     VgfPipeline,
 )

+
 aten_op = "torch.ops.aten.conv2d.default"
 exir_op = "executorch_exir_dialects_edge__ops_aten_convolution_default"

@@ -162,8 +166,8 @@ def forward(self, x):
     batches=1,
 )

-conv2d_2x2_1x1x14x13_st2 = Conv2d(
-    in_channels=1,
+conv2d_2x2_2x1x14x13_st2 = Conv2d(
+    in_channels=2,
     out_channels=1,
     kernel_size=(2, 2),
     stride=2,
@@ -363,7 +367,7 @@ def forward(self, x):
     "3x3_1x3x24x24_st1": lambda: conv2d_3x3_1x3x24x24_st1,
     "3x3_1x3x12x12_st2_pd1": lambda: conv2d_3x3_1x3x12x12_st2_pd1,
     "1x1_1x2x16x16_st1": lambda: conv2d_1x1_1x2x16x16_st1,
-    "2x2_1x1x14x13_st2_needs_adjust_pass": lambda: conv2d_2x2_1x1x14x13_st2,
+    "2x2_2x1x14x13_st2_needs_adjust_pass": lambda: conv2d_2x2_2x1x14x13_st2,
     "5x5_1x3x14x15_st3_pd1_needs_adjust_pass": lambda: conv2d_5x5_1x3x14x15_st3_pd1,
     "7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass": lambda: conv2d_7x7_1x3x16x16_st2_pd1_dl2,
     "7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass": lambda: conv2d_7x7_1x3x15x15_st1_pd0_dl1,
@@ -391,6 +395,15 @@ def forward(self, x):
 input_t = Tuple[torch.Tensor]


+def _get_dtype_count(model: torch.nn.Module):
+    nbr_convs: int = model.nbr_convs  # noqa
+    return {
+        "CONST": {"INT4": nbr_convs * 2},  # One for the weight, one for the zp.
+        "CONV2D": {"INT32": nbr_convs},
+        "RESCALE": {"INT8": nbr_convs},
+    }
+
+
 @common.parametrize("test_data", test_data_FP)
 def test_convolution_2d_tosa_FP(test_data):
     model = test_data()
@@ -417,6 +430,36 @@ def test_convolution_2d_tosa_INT(test_data):
     pipeline.run()


+@common.parametrize(
+    "test_data",
+    test_data_INT,
+    xfails={
+        "groups,per_channel_quant=True": "Int4 not supported for grouped convolutions. MLETORCH-1726",
+        "groups,per_channel_quant=False": "Int4 not supported for grouped convolutions. MLETORCH-1726",
+        "groups_bias,per_channel_quant=True": "Int4 not supported for grouped convolutions. MLETORCH-1726",
+        "groups_bias,per_channel_quant=False": "Int4 not supported for grouped convolutions. MLETORCH-1726",
+    },
+)
+def test_convolution_2d_tosa_INT_a8w4(test_data):
+    model, per_channel_quantization = test_data()
+    pipeline = TosaPipelineINT[input_t](
+        model,
+        model.get_inputs(),
+        aten_op,
+        exir_op,
+        tosa_extensions=["int4"],
+    )
+    pipeline.quantizer.set_global(
+        get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)
+    )
+    pipeline.add_stage_after(
+        "to_edge_transform_and_lower",
+        pipeline.tester.check_dtype_count,
+        _get_dtype_count(model),
+    )
+    pipeline.run()
+
+
 @common.parametrize("test_data", test_data_INT)
 @common.XfailIfNoCorstone300
 def test_convolution_2d_u55_INT(test_data):
@@ -431,6 +474,21 @@ def test_convolution_2d_u55_INT(test_data):
     pipeline.run()


+@common.parametrize("test_data", test_data_INT)
+def test_convolution_2d_u55_INT_a8w4(test_data):
+    model, per_channel_quantization = test_data()
+    pipeline = EthosU55PipelineINT[input_t](
+        model,
+        model.get_inputs(),
+        aten_op,
+        exir_op,
+    )
+    pipeline.quantizer.set_global(
+        get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)
+    )
+    pipeline.run()
+
+
 @common.parametrize("test_data", test_data_INT)
 @common.XfailIfNoCorstone320
 def test_convolution_u85_INT(test_data):
@@ -445,6 +503,21 @@ def test_convolution_u85_INT(test_data):
     pipeline.run()


+@common.parametrize("test_data", test_data_INT)
+def test_convolution_2d_u85_INT_a8w4(test_data):
+    model, per_channel_quantization = test_data()
+    pipeline = EthosU85PipelineINT[input_t](
+        model,
+        model.get_inputs(),
+        aten_op,
+        exir_op,
+    )
+    pipeline.quantizer.set_global(
+        get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)
+    )
+    pipeline.run()
+
+
 @common.parametrize("test_data", test_data_FP)
 @common.SkipIfNoModelConverter
 def test_convolution_2d_vgf_no_quant(test_data):
