diff --git a/backends/arm/public_api_manifests/api_manifest_running.toml b/backends/arm/public_api_manifests/api_manifest_running.toml index 0b096102100..f01128058e6 100644 --- a/backends/arm/public_api_manifests/api_manifest_running.toml +++ b/backends/arm/public_api_manifests/api_manifest_running.toml @@ -62,7 +62,7 @@ signature = "EthosUPartitioner.register_custom_partition_op(self, op: torch._ops [python.EthosUQuantizer] kind = "class" -signature = "EthosUQuantizer(compile_spec: 'EthosUCompileSpec', use_composable_quantizer: 'bool' = False) -> 'None'" +signature = "EthosUQuantizer(compile_spec: 'EthosUCompileSpec', use_composable_quantizer: 'bool' = True) -> 'None'" [python.EthosUQuantizer.annotate] kind = "function" @@ -146,7 +146,7 @@ signature = "VgfPartitioner.register_custom_partition_op(self, op: torch._ops.Op [python.VgfQuantizer] kind = "class" -signature = "VgfQuantizer(compile_spec: 'VgfCompileSpec', use_composable_quantizer: 'bool' = False) -> 'None'" +signature = "VgfQuantizer(compile_spec: 'VgfCompileSpec', use_composable_quantizer: 'bool' = True) -> 'None'" [python.VgfQuantizer.annotate] kind = "function" diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index f1dfb5f1323..9bf6c3530e2 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -470,21 +470,23 @@ class TOSAQuantizer(Quantizer): """Manage quantization annotations for TOSA-compatible backends. .. warning:: - Setting ``use_composable_quantizer=True`` enables an experimental API - surface that may change without notice. + The composable quantizer is now the default implementation. Setting + ``use_composable_quantizer=False`` is deprecated and will be removed in + two minor releases. """ def __init__( self, compile_spec_or_tosa_spec, - use_composable_quantizer: bool = False, + use_composable_quantizer: bool = True, ) -> None: """Create a TOSA quantizer from a TOSA spec or Arm compile spec. .. warning:: - Setting ``use_composable_quantizer=True`` enables an experimental - API surface that may change without notice. + The composable quantizer is now the default implementation. + Setting ``use_composable_quantizer=False`` is deprecated and will + be removed in two minor releases. """ self.use_composable_quantizer = use_composable_quantizer @@ -496,7 +498,7 @@ def __init__( self.quantizer = _TOSAQuantizerV2(compile_spec_or_tosa_spec) else: logger.info( - "Using default quantizer in the arm backend. This quantizer is planned to be replaced by the composable quantizer implementation in the future, see https://github.com/pytorch/executorch/issues/17701" + "Using deprecated legacy quantizer implementation in the arm backend. Setting use_composable_quantizer=False will be removed in two minor releases. See https://github.com/pytorch/executorch/issues/17701" ) self.quantizer = _TOSAQuantizerV1(compile_spec_or_tosa_spec) @@ -1239,20 +1241,25 @@ class EthosUQuantizer(TOSAQuantizer): """Quantizer supported by the Arm Ethos-U backend. .. warning:: - Setting ``use_composable_quantizer=True`` enables an experimental API - surface that may change without notice. + The composable quantizer is now the default implementation. Setting + ``use_composable_quantizer=False`` is deprecated and will be removed in + two minor releases. Args: compile_spec (EthosUCompileSpec): Backend compile specification for Ethos-U targets. - use_composable_quantizer (bool): Whether to use the composable quantizer implementation. See https://github.com/pytorch/executorch/issues/17701" for details. + use_composable_quantizer (bool): Whether to use the composable + quantizer implementation. Setting this to ``False`` is deprecated + and will be removed in two minor releases. See + [issue #17701](https://github.com/pytorch/executorch/issues/17701) + for details. """ def __init__( self, compile_spec: EthosUCompileSpec, - use_composable_quantizer: bool = False, + use_composable_quantizer: bool = True, ) -> None: super().__init__(compile_spec, use_composable_quantizer) @@ -1261,19 +1268,24 @@ class VgfQuantizer(TOSAQuantizer): """Quantizer supported by the Arm Vgf backend. .. warning:: - Setting ``use_composable_quantizer=True`` enables an experimental API - surface that may change without notice. + The composable quantizer is now the default implementation. Setting + ``use_composable_quantizer=False`` is deprecated and will be removed in + two minor releases. Args: compile_spec (VgfCompileSpec): Backend compile specification for Vgf targets. - use_composable_quantizer (bool): Whether to use the composable quantizer implementation. See https://github.com/pytorch/executorch/issues/17701" for details. + use_composable_quantizer (bool): Whether to use the composable + quantizer implementation. Setting this to ``False`` is deprecated + and will be removed in two minor releases. See + [issue #17701](https://github.com/pytorch/executorch/issues/17701) + for details. """ def __init__( self, compile_spec: VgfCompileSpec, - use_composable_quantizer: bool = False, + use_composable_quantizer: bool = True, ) -> None: super().__init__(compile_spec, use_composable_quantizer) diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index 190e8a57cd8..d4c2dfebdee 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -243,6 +243,18 @@ class PatternQuantizer(Quantizer, QuantizerReporterUser): """ + PARAMETER_TARGETS = { + torch.ops.aten.linear.default, + torch.ops.aten.convolution.default, + torch.ops.aten.conv1d.default, + torch.ops.aten.conv1d.padding, + torch.ops.aten.conv2d.default, + torch.ops.aten.conv2d.padding, + torch.ops.aten.conv3d.default, + torch.ops.aten.conv3d.padding, + torch.ops.aten.conv_transpose2d.input, + } + def __init__( self, quantization_config: QuantizationConfig | None, @@ -275,75 +287,59 @@ def get_quantizer_info(self): support_config_path, ) - def is_parameter(self, node: Node, model: torch.fx.GraphModule) -> bool: - """Returns True if the given node is a parameter of the model.""" - try: - _ = model.get_parameter(node.target) # type: ignore[arg-type] - return True - except Exception: + def is_weight(self, node: Node) -> bool: + """Returns True if node is used as a weight by all users.""" + if node.op != "get_attr": return False - def is_weight( - self, node: Node, params: list[Node], model: torch.fx.GraphModule - ) -> bool: - """Returns True if node is the first parameter of the given - parameters. - """ - return len(params) > 0 and node == params[0] + # Ensure that the node is used as a weight by all users + for user_node in node.users: + if user_node.target not in self.PARAMETER_TARGETS: + return False - def is_bias( - self, node: Node, params: list[Node], model: torch.fx.GraphModule - ) -> bool: - """Returns True if node is the second parameter of the given - parameters. - """ - return len(params) == 2 and node == params[1] + args = list(user_node.args) + if not (len(args) > 1 and node == args[1]): + return False + + return True + + def is_bias(self, node: Node) -> bool: + """Returns True if node is used as a bias by all users.""" + if node.op != "get_attr": + return False + + # Ensure that the node is used as a bias by all users + for user_node in node.users: + if user_node.target not in self.PARAMETER_TARGETS: + return False + + args = list(user_node.args) + if not (len(args) > 2 and node == args[2]): + return False + + return True def annotate_match( self, match: list[Node], config: QuantizationConfig | None, - model: torch.fx.GraphModule, ) -> None: """Annotates a matched pattern according to the given quantization config. """ - parameter_targets = { - torch.ops.aten.linear.default, - torch.ops.aten.convolution.default, - torch.ops.aten.conv1d.default, - torch.ops.aten.conv1d.padding, - torch.ops.aten.conv2d.default, - torch.ops.aten.conv2d.padding, - torch.ops.aten.conv3d.default, - torch.ops.aten.conv3d.padding, - torch.ops.aten.conv_transpose2d.input, - } for node in match: input_qspec_map = {} output_qspec = None - params = [n for n in node.all_input_nodes if self.is_parameter(n, model)] - if node.target in parameter_targets: - if len(params) == 0 or len(params) > 2: - logger.warning( - f"{node.name} is expected to have parameter tensors for weight/bias but no such inputs found, which may cause unexpected quantization annotations. This is likely caused by incorrect tensor instantiations or non-constant weight/biases." - ) - else: - if len(params) > 0: - logger.warning( - f"{node.name} is not expected to not have parameter tensors but found {[n.name for n in params]}, which may cause unexpected quantization annotations." - ) - for input_node in node.all_input_nodes: if not has_float_output(input_node): continue - if self.is_weight(input_node, params, model): + if self.is_weight(input_node): input_qspec_map[input_node] = ( config.get_weight_qspec(node) if config else None ) - elif self.is_bias(input_node, params, model): + elif self.is_bias(input_node): input_qspec_map[input_node] = ( config.get_bias_qspec(node) if config else None # type: ignore[assignment] ) @@ -370,7 +366,7 @@ def annotate(self, model: torch.fx.GraphModule) -> None: # type: ignore[overrid ) for result in matches: if result.accepted: - self.annotate_match(result.pattern, self.quantization_config, model) + self.annotate_match(result.pattern, self.quantization_config) self.report_accept(result.pattern) else: self.report_reject( @@ -424,6 +420,9 @@ class SharedQspecQuantizer(Quantizer, QuantizerReporterUser): torch.ops.aten.flip.default, torch.ops.aten.index_select.default, torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + torch.ops.aten.index_copy.default, + torch.ops.aten.index_copy_.default, torch.ops.aten.contiguous.default, torch.ops.aten.as_strided_copy.default, torch.ops.aten.pixel_shuffle.default, @@ -571,6 +570,42 @@ def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]: return shared_nodes, adjacent_qspecs + def _should_skip_while_shared_qspec(self, node: Node) -> bool: + return node.target == torch.ops.higher_order.while_loop and bool( + node.meta.get("additional_inputs") + ) + + def _annotate_while_with_additional_inputs( + self, + root_node: Node, + adjacent_qspecs: list[Any], + ) -> bool: + if not self._should_skip_while_shared_qspec(root_node): + return False + if len(adjacent_qspecs) == 0: + self.report_reject( + [root_node], + "Couldn't find any adjacent quantization spec to annotate while_loop.", + ) + return True + + input_qspec = adjacent_qspecs[0] + input_qspec_map: dict[Node, Optional[QuantizationSpec]] = { + n: input_qspec for n in self._get_input_nodes_with_float_output(root_node) + } + output_qspec: Optional[QuantizationSpec] = None + if len(self._get_user_nodes_with_float_input(root_node)) > 0: + output_qspec = input_qspec + + _mark_node_as_quantized( + root_node, + input_qspec_map, + output_qspec, + is_quantized=True, + ) + self.report_accept([root_node]) + return True + def _annotate_shared_cluster(self, root_node: Node) -> None: if ( len(self._get_input_nodes_with_float_output(root_node)) == 0 @@ -592,9 +627,11 @@ def _annotate_shared_cluster(self, root_node: Node) -> None: node_order = {node: index for index, node in enumerate(root_node.graph.nodes)} ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0)) + if self._annotate_while_with_additional_inputs(root_node, adjacent_qspecs): + return + # Ensure the root node is the first one in the graph. root_node = ordered_nodes[0] - if len(adjacent_qspecs) > 0: root_node_float_inputs = self._get_input_nodes_with_float_output(root_node) if len(root_node_float_inputs) > 0: diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index d06203cede3..0c64d147c84 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -21,6 +21,7 @@ from torchao.quantization.pt2e.quantizer import ( DerivedQuantizationSpec, + FixedQParamsQuantizationSpec, QuantizationSpec, QuantizationSpecBase, SharedQuantizationSpec, @@ -284,10 +285,18 @@ def get_input_act_qspec(self, node=None, input_node=None): For comparison operators, make sure that both inputs share the same quantization spec, by returning a SharedQuantizationSpec that ties the - quantization of both inputs together. For other operators, return the - default input activation spec. + quantization of both inputs together. + + For trigonometric ops, ensure that input spec has fixed qparams. + + For other operators, return the default input activation spec. """ + # MLETORCH-1853: Fix lazy import when moving files around + from executorch.backends.arm.quantizer.quantization_annotator import ( + _fixed_input_qspec_ops, + ) + if node is None or input_node is None: return super().get_input_act_qspec(node, input_node) @@ -296,6 +305,29 @@ def get_input_act_qspec(self, node=None, input_node=None): return super().get_input_act_qspec(node, input_node) else: return SharedQuantizationSpec((node.args[0], node)) + elif node.target in _fixed_input_qspec_ops: + + input_act_qspec = super().get_input_act_qspec(node, input_node) + if not hasattr(input_act_qspec, "dtype") or not isinstance( + input_act_qspec.dtype, torch.dtype + ): + raise ValueError( + f"{node.target} requires an input activation quantization " + "spec to use fixed input qparams." + ) + dtype = getattr(input_act_qspec, "dtype", None) + num_bits = torch.iinfo(dtype).bits + + qparams = _fixed_input_qspec_ops[node.target][num_bits] + return FixedQParamsQuantizationSpec( + dtype=dtype, + scale=qparams.scale, + zero_point=qparams.zero_point, + quant_min=input_act_qspec.quant_min, + quant_max=input_act_qspec.quant_max, + qscheme=input_act_qspec.qscheme, + is_dynamic=input_act_qspec.is_dynamic, + ) return super().get_input_act_qspec(node, input_node) diff --git a/backends/arm/quantizer/quantizer_support.py b/backends/arm/quantizer/quantizer_support.py index bb3ea158fba..d6a725c2b06 100644 --- a/backends/arm/quantizer/quantizer_support.py +++ b/backends/arm/quantizer/quantizer_support.py @@ -77,8 +77,6 @@ def check_pattern(cls, pattern): torch.ops.aten.relu_.default, torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default, - torch.ops.aten.hardsigmoid.default, - torch.ops.aten.hardsigmoid_.default, torch.ops.aten.clamp.default, torch.ops.aten.clamp_.default, ] @@ -168,6 +166,14 @@ def check_pattern(cls, pattern): (torch.ops.aten.ge.Scalar,), (torch.ops.aten.eq.Scalar,), (torch.ops.aten.ne.Scalar,), + (torch.ops.aten.lstm.input,), + (torch.ops.aten.rnn_tanh.input,), + (torch.ops.aten.rnn_relu.input,), + (torch.ops.aten.gru.input,), + (torch.ops.aten.asin.default,), + (torch.ops.aten.acos.default,), + (torch.ops.aten.atanh.default,), + (torch.ops.aten.einsum.default,), ] ) TOSA_QUANTIZER_SUPPORT_DICT: dict[tuple[OpOverload, ...], type[PatternCheck] | None] = { diff --git a/backends/arm/scripts/docgen/docgen.py b/backends/arm/scripts/docgen/docgen.py index 75baf3e8e40..c0b708bdb5e 100644 --- a/backends/arm/scripts/docgen/docgen.py +++ b/backends/arm/scripts/docgen/docgen.py @@ -46,7 +46,9 @@ def get_docstring(obj) -> str: lines = docstring.split("\n") for line in lines: - if ":" in line and line.startswith(" "): + # Only first-level arg lines should become bullets. + is_arg_line = line.startswith(" ") and not line.startswith(" ") + if ":" in line and is_arg_line: new_line = line.strip() pos = new_line.index(":") new_line = f"- **{new_line[:pos]}**" + new_line[pos:] diff --git a/backends/arm/test/misc/test_quant_custom_meta.py b/backends/arm/test/misc/test_quant_custom_meta.py index cd9964f4511..f64b8067098 100644 --- a/backends/arm/test/misc/test_quant_custom_meta.py +++ b/backends/arm/test/misc/test_quant_custom_meta.py @@ -105,5 +105,6 @@ def test_quantized_to_float_transition_tosa_INT_FP(fp_extension: bool): ) pipeline.quantizer.set_module_type(torch.nn.Sigmoid, None) # type: ignore pipeline.quantizer.set_module_type(torch.nn.Conv1d, None) # type: ignore + pipeline.quantizer.set_io(None) # type: ignore pipeline.run() diff --git a/backends/arm/test/misc/test_shared_qspecs.py b/backends/arm/test/misc/test_shared_qspecs.py index de07bd5f6c2..93129633418 100644 --- a/backends/arm/test/misc/test_shared_qspecs.py +++ b/backends/arm/test/misc/test_shared_qspecs.py @@ -87,8 +87,8 @@ class SharedQspecMulipleClusters(torch.nn.Module): "quantized_decomposed.dequantize_per_tensor.default": {None: 8}, "aten.add.Tensor": {_INT8_QSPEC: 2}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, 0, -128, 127, torch.int8): 2, @@ -122,8 +122,8 @@ class SharedQspecInputForkNonShared(torch.nn.Module): "quantized_decomposed.quantize_per_tensor.default": {None: 4}, "quantized_decomposed.dequantize_per_tensor.default": {None: 4}, } - inputs_qspecs = {None: 2} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 2} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, -64, -128, 127, torch.int8): 3, @@ -149,8 +149,8 @@ class SharedQspecInputForkShared(torch.nn.Module): "quantized_decomposed.quantize_per_tensor.default": {None: 5}, "quantized_decomposed.dequantize_per_tensor.default": {None: 5}, } - inputs_qspecs = {None: 2} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 2} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, -64, -128, 127, torch.int8): 2, @@ -178,8 +178,8 @@ class SharedQspecInputForkXShared(torch.nn.Module): "quantized_decomposed.quantize_per_tensor.default": {None: 4}, "quantized_decomposed.dequantize_per_tensor.default": {None: 4}, } - inputs_qspecs = {None: 2} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 2} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, -64, -128, 127, torch.int8): 2, @@ -206,8 +206,8 @@ class SharedQspecInputForkYShared(torch.nn.Module): "quantized_decomposed.quantize_per_tensor.default": {None: 5}, "quantized_decomposed.dequantize_per_tensor.default": {None: 5}, } - inputs_qspecs = {None: 2} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 2} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, -64, -128, 127, torch.int8): 2, @@ -234,8 +234,8 @@ class SharedQspecInputForkXConstant(torch.nn.Module): "quantized_decomposed.quantize_per_tensor.default": {None: 2}, "quantized_decomposed.dequantize_per_tensor.default": {None: 3}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, 0, -128, 127, torch.int8): 2, @@ -260,8 +260,8 @@ class SharedQspecInputForkYConstant(torch.nn.Module): "quantized_decomposed.quantize_per_tensor.default": {None: 2}, "quantized_decomposed.dequantize_per_tensor.default": {None: 3}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, 0, -128, 127, torch.int8): 1, @@ -287,8 +287,8 @@ class SharedQspecOutputForkNonShared(torch.nn.Module): "quantized_decomposed.dequantize_per_tensor.default": {None: 4}, "aten.add.Tensor": {_INT8_QSPEC: 1}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 2} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, 0, -128, 127, torch.int8): 3, @@ -315,8 +315,8 @@ class SharedQspecOutputForkShared(torch.nn.Module): "quantized_decomposed.quantize_per_tensor.default": {None: 4}, "quantized_decomposed.dequantize_per_tensor.default": {None: 6}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 3} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.015678614, 0, -128, 127, torch.int8): 6, @@ -341,10 +341,10 @@ class SharedQspecManyForks(torch.nn.Module): qspecs = { "quantized_decomposed.quantize_per_tensor.default": {None: 6}, "quantized_decomposed.dequantize_per_tensor.default": {None: 9}, - "aten.t.default": {None: 1}, + "aten.t.default": {_INT8_QSPEC: 1}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.086232387, 104, -128, 127, torch.int8): 9, @@ -372,8 +372,8 @@ class SharedQspecSurroundedQuantizedOp(torch.nn.Module): "quantized_decomposed.dequantize_per_tensor.default": {None: 5}, "aten.add.Tensor": {_INT8_QSPEC: 1}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.509554982, 123, -128, 127, torch.int8): 3, @@ -403,8 +403,8 @@ class SharedQspecSurroundedQuantizedOpConstant(torch.nn.Module): "aten.ones.default": {_INT8_QSPEC: 1}, "aten.add.Tensor": {_INT8_QSPEC: 1}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { (0.003921569, -128, -128, 127, torch.int8): 1, @@ -429,18 +429,22 @@ class SharedQspecSub(torch.nn.Module): """A shared qspec node with float input.""" qspecs = { - "quantized_decomposed.quantize_per_tensor.default": {None: 2}, - "quantized_decomposed.dequantize_per_tensor.default": {None: 2}, + "quantized_decomposed.quantize_per_tensor.default": {None: 4}, + "quantized_decomposed.dequantize_per_tensor.default": {None: 4}, "aten.sub.Tensor": {None: 1}, } - inputs_qspecs = {None: 2} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 2} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { + (0.003919654, -128, -128, 127, torch.int8): 1, (0.035276882, -128, -128, 127, torch.int8): 2, + (0.03919654, -128, -128, 127, torch.int8): 1, }, "quantized_decomposed.quantize_per_tensor.default": { + (0.003919654, -128, -128, 127, torch.int8): 1, (0.035276882, -128, -128, 127, torch.int8): 2, + (0.03919654, -128, -128, 127, torch.int8): 1, }, } @@ -462,8 +466,8 @@ class SharedQspecCompetingQspecs(torch.nn.Module): "quantized_decomposed.dequantize_per_tensor.default": {None: 4}, "aten.conv2d.default": {_INT8_QSPEC: 1}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_channel.default": { (0, -2147483647, 2147483647, torch.int32): 1, @@ -502,20 +506,16 @@ class SharedQspecNoQspecs(torch.nn.Module): "quantized_decomposed.dequantize_per_tensor.default": {None: 2}, "aten.sub.Tensor": {None: 2}, } - inputs_qspecs = {None: 1} - outputs_qspecs = {None: 1} + inputs_qspecs = {_INT8_QSPEC: 1} + outputs_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.dequantize_per_tensor.default": { - ( - 1.5259e-05, - -128, - -128, - 127, - torch.int8, - ): 2, # The network always has 0 output -> very small scale. + (1.5259e-05, -128, -128, 127, torch.int8): 1, + (0.03919654, -128, -128, 127, torch.int8): 1, }, "quantized_decomposed.quantize_per_tensor.default": { - (1.5259e-05, -128, -128, 127, torch.int8): 2, + (1.5259e-05, -128, -128, 127, torch.int8): 1, + (0.03919654, -128, -128, 127, torch.int8): 1, }, } @@ -542,21 +542,19 @@ class MixedMaximumInt8Int16(torch.nn.Module): """A shared qspec node with int16/int8 inputs.""" qspecs = { - "quantized_decomposed.quantize_per_tensor.default": {None: 6}, - "quantized_decomposed.dequantize_per_tensor.default": {None: 6}, + "quantized_decomposed.quantize_per_tensor.default": {None: 4}, + "quantized_decomposed.dequantize_per_tensor.default": {None: 5}, } - input_qspecs = {None: 1} - output_qspecs = {None: 1} + input_qspecs = {_INT8_QSPEC: 1} + output_qspecs = {_INT8_QSPEC: 1} quant_params = { "quantized_decomposed.quantize_per_tensor.default": { - (0.007839307, -128, -128, 127, torch.int8): 2, - (0.015678614, 0, -128, 127, torch.int8): 2, - (0.000244141, 0, -32767, 32767, torch.int16): 2, + (0.007839307, -128, -128, 127, torch.int8): 1, + (0.015678614, 0, -128, 127, torch.int8): 3, }, "quantized_decomposed.dequantize_per_tensor.default": { - (0.007839307, -128, -128, 127, torch.int8): 2, - (0.015678614, 0, -128, 127, torch.int8): 2, - (0.000244141, 0, -32767, 32767, torch.int16): 2, + (0.007839307, -128, -128, 127, torch.int8): 1, + (0.015678614, 0, -128, 127, torch.int8): 4, }, } diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py index 6718fedea04..e0d910bd069 100644 --- a/backends/arm/test/ops/test_to_copy.py +++ b/backends/arm/test/ops/test_to_copy.py @@ -330,18 +330,14 @@ def test_to_vgf_quant(test_data: Tuple): ), } -redundant_xfails_FP = { +redundant_xfails = { "rand_int8_int8": "Tracing graph with quantized input is not supported.", "rand_int16_int16": "Tracing graph with quantized input is not supported.", } -redundant_xfails_INT = redundant_xfails_FP | { - "rand_fp16_fp16": "FP16 is not supported", -} - @common.parametrize( - "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails_FP + "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails ) def test_to_tosa_FP_REDUNDANT_CAST(test_data: Tuple): test_tensor, new_dtype = test_data() @@ -356,7 +352,7 @@ def test_to_tosa_FP_REDUNDANT_CAST(test_data: Tuple): @common.parametrize( - "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails_INT + "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails ) def test_to_tosa_INT_REDUNDANT_CAST(test_data: Tuple): test_tensor, new_dtype = test_data() diff --git a/backends/arm/test/ops/test_transpose_conv2d.py b/backends/arm/test/ops/test_transpose_conv2d.py index 1ab077841b6..a288cc3ebac 100644 --- a/backends/arm/test/ops/test_transpose_conv2d.py +++ b/backends/arm/test/ops/test_transpose_conv2d.py @@ -7,14 +7,14 @@ import conftest import torch - -from executorch.backends.arm.quantizer import QuantizationConfig from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, get_symmetric_a8w4_quantization_config, get_symmetric_quantization_config, TOSAQuantizer, ) + +from executorch.backends.arm.quantizer.quantization_config import TOSAQuantizationConfig from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( EthosU55PipelineINT, @@ -311,7 +311,7 @@ def test_conv_transpose2d_tosa_INT_qat_axis1_uses_non_fused_fake_quant(test_data ), ) quantizer.set_global( - QuantizationConfig( + TOSAQuantizationConfig( input_activation=activation_qspec, output_activation=activation_qspec, weight=weight_qspec, @@ -350,7 +350,7 @@ def test_conv_transpose2d_tosa_INT_grouped_qat_axis0_keeps_fused_fake_quant(test ), ) quantizer.set_global( - QuantizationConfig( + TOSAQuantizationConfig( input_activation=activation_qspec, output_activation=activation_qspec, weight=weight_qspec, @@ -389,7 +389,7 @@ def test_conv_transpose2d_tosa_INT_ptq_observer_updates_axis(test_data): ), ) quantizer.set_global( - QuantizationConfig( + TOSAQuantizationConfig( input_activation=activation_qspec, output_activation=activation_qspec, weight=weight_qspec, @@ -427,7 +427,7 @@ def test_conv_transpose2d_tosa_INT_qat_correct_qspec_wrong_ctor_axis(test_data): ), ) quantizer.set_global( - QuantizationConfig( + TOSAQuantizationConfig( input_activation=activation_qspec, output_activation=activation_qspec, weight=weight_qspec, diff --git a/backends/arm/test/ops/test_unary_combos.py b/backends/arm/test/ops/test_unary_combos.py index bc4bb0b39d9..2ecd04b9c79 100644 --- a/backends/arm/test/ops/test_unary_combos.py +++ b/backends/arm/test/ops/test_unary_combos.py @@ -104,9 +104,7 @@ def test_add_tensor_tosa_INT_combos(model_cls): @common.XfailIfNoCorstone300 -@common.parametrize( - "model_cls", MODEL_DATA, xfails={"NegAdd": "Numerical failure. MLBEDSW-11581"} -) +@common.parametrize("model_cls", MODEL_DATA) def test_add_tensor_u55_INT_combos(model_cls): m, inputs, exir = _build(model_cls) p = EthosU55PipelineINT[Tensor1]( diff --git a/backends/arm/test/ops/test_while.py b/backends/arm/test/ops/test_while.py index b5cab047a50..51b56661b50 100644 --- a/backends/arm/test/ops/test_while.py +++ b/backends/arm/test/ops/test_while.py @@ -8,6 +8,8 @@ import torch import torch.fx +from executorch.backends.arm.quantizer import get_symmetric_quantization_config +from executorch.backends.arm.quantizer.arm_quantizer import _TOSAQuantizerV2 from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.arm.test.tester.test_pipeline import ( @@ -228,6 +230,28 @@ def test_while_loop_tosa_INT(case: Callable[[], Tuple[torch.nn.Module, Tuple]]): pipeline.run() +def test_while_loop_tosa_INT_composable_large_threshold(): + module, example_inputs = test_cases["large_threshold"]() + pipeline = TosaPipelineINT[tuple]( + module, + example_inputs, + "torch.ops.higher_order.while_loop", + tosa_extensions=["cf"], + ) + + composable_quantizer = _TOSAQuantizerV2(pipeline.tester.compile_spec) + composable_quantizer.set_global(get_symmetric_quantization_config()) + pipeline.quantizer.quantizer = composable_quantizer + + pipeline.add_stage_after( + "to_edge_transform_and_lower", + ArmTester.check_not, + pipeline.tester, + ["torch.ops.higher_order.while_loop"], + ) + pipeline.run() + + @common.parametrize( "case", test_cases, diff --git a/backends/cortex_m/test/misc/test_portable_int8.py b/backends/cortex_m/test/misc/test_portable_int8.py index 4e3b5f41561..920b4200e60 100644 --- a/backends/cortex_m/test/misc/test_portable_int8.py +++ b/backends/cortex_m/test/misc/test_portable_int8.py @@ -301,6 +301,36 @@ def _quantize_and_export( (torch.randn(6), torch.randn(6)), torch.int64, ), + "index_put_": OpCase( + torch.ops.aten.index_put_.default, + _build_module( + lambda x, y: torch.ops.aten.index_put_.default( + x, (torch.tensor([1, 3]),), torch.tensor([1.0, 2.0]), False + ) + ), + (torch.randn(6), torch.randn(6)), + torch.int64, + ), + "index_copy": OpCase( + torch.ops.aten.index_copy.default, + _build_module( + lambda x, y: torch.ops.aten.index_copy.default( + x, 0, torch.tensor([0, 2]), y + ) + ), + (torch.randn(4, 5), torch.randn(2, 5)), + torch.int64, + ), + "index_copy_": OpCase( + torch.ops.aten.index_copy_.default, + _build_module( + lambda x, y: torch.ops.aten.index_copy_.default( + x, 0, torch.tensor([0, 2]), y + ) + ), + (torch.randn(4, 5), torch.randn(2, 5)), + torch.int64, + ), "contiguous": OpCase( torch.ops.aten.contiguous.default, _build_module(lambda x, y: torch.ops.aten.contiguous.default(x)), diff --git a/docs/source/backends/arm-ethos-u/arm-ethos-u-quantization.md b/docs/source/backends/arm-ethos-u/arm-ethos-u-quantization.md index 68fe9d160aa..c2f7035c89c 100644 --- a/docs/source/backends/arm-ethos-u/arm-ethos-u-quantization.md +++ b/docs/source/backends/arm-ethos-u/arm-ethos-u-quantization.md @@ -16,18 +16,23 @@ The Arm Ethos-U delegate supports the following quantization schemes: ### Quantization API ```python -class EthosUQuantizer(compile_spec: 'EthosUCompileSpec', use_composable_quantizer: 'bool' = False) -> 'None' +class EthosUQuantizer(compile_spec: 'EthosUCompileSpec', use_composable_quantizer: 'bool' = True) -> 'None' ``` Quantizer supported by the Arm Ethos-U backend. .. warning:: - Setting ``use_composable_quantizer=True`` enables an experimental API - surface that may change without notice. + The composable quantizer is now the default implementation. Setting + ``use_composable_quantizer=False`` is deprecated and will be removed in + two minor releases. Args: - **compile_spec (EthosUCompileSpec)**: Backend compile specification for Ethos-U targets. -- **use_composable_quantizer (bool)**: Whether to use the composable quantizer implementation. See https://github.com/pytorch/executorch/issues/17701" for details. +- **use_composable_quantizer (bool)**: Whether to use the composable + quantizer implementation. Setting this to ``False`` is deprecated + and will be removed in two minor releases. See + [issue #17701](https://github.com/pytorch/executorch/issues/17701) + for details. ```python def EthosUQuantizer.add_quantizer(self, quantizer: 'Quantizer') -> 'TOSAQuantizer': diff --git a/docs/source/backends/arm-vgf/arm-vgf-quantization.md b/docs/source/backends/arm-vgf/arm-vgf-quantization.md index 49ba41f74e1..2dc5b5631e6 100644 --- a/docs/source/backends/arm-vgf/arm-vgf-quantization.md +++ b/docs/source/backends/arm-vgf/arm-vgf-quantization.md @@ -35,18 +35,23 @@ setting using the `set_module_name` or `set_module_type` methods. ### Quantization API ```python -class VgfQuantizer(compile_spec: 'VgfCompileSpec', use_composable_quantizer: 'bool' = False) -> 'None' +class VgfQuantizer(compile_spec: 'VgfCompileSpec', use_composable_quantizer: 'bool' = True) -> 'None' ``` Quantizer supported by the Arm Vgf backend. .. warning:: - Setting ``use_composable_quantizer=True`` enables an experimental API - surface that may change without notice. + The composable quantizer is now the default implementation. Setting + ``use_composable_quantizer=False`` is deprecated and will be removed in + two minor releases. Args: - **compile_spec (VgfCompileSpec)**: Backend compile specification for Vgf targets. -- **use_composable_quantizer (bool)**: Whether to use the composable quantizer implementation. See https://github.com/pytorch/executorch/issues/17701" for details. +- **use_composable_quantizer (bool)**: Whether to use the composable + quantizer implementation. Setting this to ``False`` is deprecated + and will be removed in two minor releases. See + [issue #17701](https://github.com/pytorch/executorch/issues/17701) + for details. ```python def VgfQuantizer.add_quantizer(self, quantizer: 'Quantizer') -> 'TOSAQuantizer': diff --git a/examples/arm/quantizer_tutorial.ipynb b/examples/arm/quantizer_tutorial.ipynb index 76979316002..25b99dbd4b5 100644 --- a/examples/arm/quantizer_tutorial.ipynb +++ b/examples/arm/quantizer_tutorial.ipynb @@ -16,13 +16,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# WIP: TOSA/EthosU/VgfQuantizer composable quantizer tutorial\n", + "# TOSA/EthosU/VgfQuantizer composable quantizer tutorial\n", "\n", "This is an in-depth tutorial of the new `TOSA/EthosU/VgfQuantizer` API. While the `TOSAQuantizer` is used in the example, both the\n", "`EthosUQuantizer` and `VgfQuantizer` directly inherit from this base class. \n", "\n", - "Note that the main API and functionality remains largely the same to allow for a drop-in replacement, but the underlying framework is different - as will be explained. **Both the quantizer and this tutorial are currently experimental and may change without prior notice.** Refer to https://github.com/pytorch/executorch/issues/17701 for questions and feedback.\n", - "\n", "Before you begin:\n", "1. (In a clean virtual environment with a compatible Python version) Install executorch using `./install_executorch.sh`\n", "2. Install Arm TOSA dependencies using `examples/arm/setup.sh --disable-ethos-u-deps`\n", diff --git a/examples/models/llama/tests/test_export_llama_lib.py b/examples/models/llama/tests/test_export_llama_lib.py index f3dc403aa05..2e708479b4e 100644 --- a/examples/models/llama/tests/test_export_llama_lib.py +++ b/examples/models/llama/tests/test_export_llama_lib.py @@ -7,8 +7,6 @@ import unittest -import torch - from executorch.devtools.backend_debug import get_delegation_info try: @@ -117,8 +115,6 @@ def test_get_quantizer_and_quant_params_returns_vgf_quantizer(self): self.assertIsNone(quant_dtype) self.assertEqual(len(quantizers), 1) self.assertIsInstance(quantizers[0], VgfQuantizer) - self.assertIsNotNone(quantizers[0].global_config) - self.assertEqual(quantizers[0].module_type_config, {}) @unittest.skipUnless(HAS_ARM_BACKEND, "ARM backend not available") def test_get_quantizer_and_quant_params_returns_vgf_linear_quantizer(self): @@ -134,8 +130,6 @@ def test_get_quantizer_and_quant_params_returns_vgf_linear_quantizer(self): self.assertEqual(len(quantizers), 1) self.assertIsInstance(quantizers[0], VgfQuantizer) - self.assertIsNone(quantizers[0].global_config) - self.assertIn(torch.nn.Linear, quantizers[0].module_type_config) @unittest.skipUnless(HAS_ARM_BACKEND, "ARM backend not available") def test_vgf_16a8w_requires_int16_compile_spec_extension(self): @@ -162,4 +156,3 @@ def test_vgf_16a8w_accepts_int16_compile_spec_extension(self): self.assertEqual(len(quantizers), 1) self.assertIsInstance(quantizers[0], VgfQuantizer) - self.assertIn(torch.nn.Linear, quantizers[0].module_type_config)