From ece4a0b0b2a945225cac471407add249f19c70a1 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 16 Jan 2026 21:59:15 +0400 Subject: [PATCH 01/17] Extend quantizer to support compress_pt2e --- backends/openvino/quantizer/quantizer.py | 106 +++++++++++++++++++---- 1 file changed, 90 insertions(+), 16 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 5766013689b..28042b5a5f3 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -13,7 +13,6 @@ import nncf # type: ignore[import-untyped] import nncf.common.quantization as quantization # type: ignore[import-untyped] import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] - import torch.fx from executorch.backends.openvino.quantizer.observers import ( INT4WeightObserver, @@ -78,12 +77,12 @@ class OpenVINOQuantizer(Quantizer): optimally for the inference via OpenVINO. """ - WEIGHTS_ONLY_COMPRESSION_MODES = ( - QuantizationMode.INT4WO_SYM, - QuantizationMode.INT4WO_ASYM, - QuantizationMode.INT8WO_SYM, - QuantizationMode.INT8WO_ASYM, - ) + WEIGHTS_ONLY_COMPRESSION_MODES = { + QuantizationMode.INT4WO_SYM: "int4_sym", + QuantizationMode.INT4WO_ASYM: "int4_asym", + QuantizationMode.INT8WO_SYM: "int8_sym", + QuantizationMode.INT8WO_ASYM: "int8_asym", + } def __init__( self, @@ -116,17 +115,63 @@ def __init__( preset=preset, model_type=model_type, **kwargs ) else: - compression_mode = mode.value.replace( - "wo", "" - ) # Mode value has to match NNCF CompressWeightsMode + compression_mode = OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES[ + mode + ] # Mode value has to match NNCF CompressWeightsMode weight_compression_configuration = get_weight_compression_configuration( nncf.CompressWeightsMode(compression_mode), **kwargs, ) - subset_size = 1 # Doesn't really matter in this case since it is data-free. 
Should just be +ve + weight_compression_configuration["subset_size"] = ( + 1 # Doesn't really matter in this case since it is data-free. Should just be +ve + ) + self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( - subset_size=subset_size, **weight_compression_configuration + **weight_compression_configuration + ) + + def _require_wc_algo( + self, + ) -> nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression: + if not isinstance( + self._algo, + nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression, + ): + raise TypeError( + "This method requires WeightCompression algo, but " + f"got {type(self._algo).__name__} (mode={self.mode})." ) + return self._algo + + def _require_ptq_algo(self) -> MinMaxQuantization: + if not isinstance(self._algo, MinMaxQuantization): + raise TypeError( + "This method requires MinMaxQuantization algo, but " + f"got {type(self._algo).__name__} (mode={self.mode})." + ) + return self._algo + + def get_weights_compression_config(self) -> Dict[str, Any]: + """ + Returns a dictionary with all_layers, group_size, backup_mode and Quantization mode parameters + used by the compress_pt2e weight compression algorithm. + + :return: A dictionary containing: + 1. mode: Quantization mode. One of INT4 Sym, INT4 Asym, INT8 Sym, INT8 Asym. + 2. group_size: group size to be used for group-wise compression. + 3. all_layers: Indicates whether embeddings and last MatMul layers should be compressed to a primary + precision. By default, the backup precision is assigned for the embeddings and last MatMul layers. + 4. backup_mode: Defines a backup mode for mixed-precision weight compression. 
+ """ + algo = self._require_wc_algo() + quantizer_initialized_algo_attributes = { + "mode": algo.mode, + "group_size": algo.group_size, + "all_layers": algo.all_layers, + "backup_mode": algo.backup_mode, + } + + return quantizer_initialized_algo_attributes def set_ignored_scope( self, @@ -160,8 +205,32 @@ def set_ignored_scope( def get_nncf_quantization_setup( self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph ) -> quantization.quantizer_setup.SingleConfigQuantizerSetup: - self._algo._set_backend_entity(model) - return self._algo.find_quantization_setup(model, nncf_graph) + algo = self._require_ptq_algo() + algo._set_backend_entity(model) + return algo.find_quantization_setup(model, nncf_graph) + + def get_nncf_weight_compression_parameters( + self, + model: torch.fx.GraphModule, + nncf_graph: NNCFGraph, + ) -> Tuple[ + List[WeightCompressionParameters], + List[WeightCompressionParameters], + List[WeightCompressionParameters], + ]: + """ + Collect weight compression parameters for the given FX model and NNCF graph. + + :param model: FX GraphModule to analyze for weight compression. + :param nncf_graph: NNCFGraph representation of the model. + :return: A tuple of: + - all parameters eligible for weight compression, + - ratio-defining parameters used to set primary/backup precisions, + - parameters that are not compressible and remain in original precision. + """ + algo = self._require_wc_algo() + algo.set_backend_entity(model) + return algo.get_weight_compression_parameters(model, nncf_graph) def _annotate_weight_compression( self, @@ -182,12 +251,17 @@ def _annotate_weight_compression( :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. :return: Updated mapping of FX nodes with weight compression annotations. 
""" - self._algo.set_backend_entity(model) - all_wc_params, _ = self._algo.get_weight_compression_parameters( + all_wc_params, *_ = self.get_nncf_weight_compression_parameters( model, nncf_graph ) for wc_param in all_wc_params: + if not wc_param.compression_config: + nncf_logger.debug( + "Skipping weight compression for node '%s' because compression_config is missing.", + getattr(wc_param.node_with_weight, "node_name", ""), + ) + continue node_with_weight = wc_param.node_with_weight target_node = nncf_fx.node_utils.get_graph_node_by_name( graph, node_with_weight.node_name From 9cc099161595a6d9c4fb8d07070291314a18ea3d Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 16 Jan 2026 22:00:09 +0400 Subject: [PATCH 02/17] integrate compress_pt2e into the example --- backends/openvino/quantization/__init__.py | 3 + .../openvino/quantization/nncf_compression.py | 79 ++++++++++++ backends/openvino/quantizer/__init__.py | 8 +- .../openvino/quantizer/llm_compression.py | 112 ++++++++++++++++++ backends/openvino/requirements.txt | 2 +- examples/models/llama/export_llama_lib.py | 30 ++++- extension/llm/export/config/llm_config.py | 11 +- 7 files changed, 235 insertions(+), 10 deletions(-) create mode 100644 backends/openvino/quantization/__init__.py create mode 100644 backends/openvino/quantization/nncf_compression.py create mode 100644 backends/openvino/quantizer/llm_compression.py diff --git a/backends/openvino/quantization/__init__.py b/backends/openvino/quantization/__init__.py new file mode 100644 index 00000000000..5bc3585a975 --- /dev/null +++ b/backends/openvino/quantization/__init__.py @@ -0,0 +1,3 @@ +from .nncf_compression import use_nncf_compression + +__all__ = ["use_nncf_compression"] \ No newline at end of file diff --git a/backends/openvino/quantization/nncf_compression.py b/backends/openvino/quantization/nncf_compression.py new file mode 100644 index 00000000000..937e756f904 --- /dev/null +++ b/backends/openvino/quantization/nncf_compression.py @@ -0,0 +1,79 @@ 
+# Copyright (c) Intel Corporation +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file found in the +# LICENSE file in the root directory of this source tree. + +# mypy: disable-error-code=import-not-found + +import torch + +try: + import nncf # type: ignore[import-untyped] + from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped] +except ImportError: + raise ImportError("Please install nncf via backends/openvino/requirements.txt") + + +def get_calibration_data( + module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int +): + # TODO: change criteria & support batch inputs if necessary + pos = torch.tensor(0, dtype=torch.int64) + token_list = tokenizer.encode(prompts, bos=True, eos=False) + + with torch.no_grad(): + while token_list[-1] != tokenizer.eos_id and pos < max_len: + logits = module( + torch.full((1, 1), token_list[pos]), + {"input_pos": torch.tensor((pos,))}, + ) + pos += 1 + if pos >= len(token_list): + token_list.append(torch.argmax(logits[:], dim=-1).item()) + token_list = [ + ( + pos, + token, + ) + for pos, token in enumerate(token_list) + ] + return token_list + + +def transform_fn(token_pos_map: tuple[int, str]): + # tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) + inputs = () + inputs = ( + torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), + {"input_pos": torch.tensor([token_pos_map[0]])}, + ) + + return inputs + + +def apply_nncf_data_aware_compression( + builder_exported, quantizers, awq: bool, scale_estimation: bool +): + tokenizer = get_tokenizer(builder_exported.tokenizer_path) + + builder_exported.calibration_data = get_calibration_data( + builder_exported.pre_autograd_graph_module, + tokenizer, + builder_exported.calibration_data, + builder_exported.max_seq_len, + ) + + builder_exported.pre_autograd_graph_module = ( + nncf.experimental.torch.fx.compress_pt2e( + 
builder_exported.pre_autograd_graph_module, + quantizer=quantizers[0], + dataset=nncf.Dataset( + builder_exported.calibration_data, + transform_func=transform_fn, + ), + awq=awq, + scale_estimation=scale_estimation, + ) + ) + return builder_exported \ No newline at end of file diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py index 5aae52ef3e8..e819aaf5159 100644 --- a/backends/openvino/quantizer/__init__.py +++ b/backends/openvino/quantizer/__init__.py @@ -1,3 +1,9 @@ +from .llm_compression import apply_nncf_data_aware_compression from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model -__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"] +__all__ = [ + "OpenVINOQuantizer", + "quantize_model", + "QuantizationMode", + "apply_nncf_data_aware_compression", +] diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py new file mode 100644 index 00000000000..04f29df4d03 --- /dev/null +++ b/backends/openvino/quantizer/llm_compression.py @@ -0,0 +1,112 @@ +# Copyright (c) Intel Corporation +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file found in the +# LICENSE file in the root directory of this source tree. 
+ +# mypy: disable-error-code=import-not-found + +from typing import Callable, List, Optional, Tuple, Union + +import torch +from executorch.extension.llm.export.builder import LLMEdgeManager +from torchao.quantization.pt2e.quantizer import Quantizer + +try: + import nncf # type: ignore[import-untyped] + from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped] +except ImportError: + raise ImportError("Please install nncf via backends/openvino/requirements.txt") + + +# This code is taken from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278 +def get_calibration_data( + module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int +): + """ + This method is used to obtain calibration data from a prompt so that the algorithm + is calibrated not only with the dataset but also the inputs which are output by + the model. + Currently, this method is only tested with Llama models. + """ + # TODO: change criteria & support batch inputs if necessary + pos = torch.tensor(0, dtype=torch.int64) + token_list = tokenizer.encode(prompts, bos=True, eos=False) + + with torch.no_grad(): + while token_list[-1] != tokenizer.eos_id and pos < max_len: + logits = module( + torch.full((1, 1), token_list[pos]), + {"input_pos": torch.tensor((pos,))}, + ) + pos += 1 + if pos >= len(token_list): + token_list.append(torch.argmax(logits[:], dim=-1).item()) + token_list = [ + ( + pos, + token, + ) + for pos, token in enumerate(token_list) + ] + return token_list + + +def transform_fn(token_pos_map: Tuple[int, int]): + """ + Transforms and returns input from dataset so that it is acceptable by the model + Currently, this method is only tested with Llama models. 
+ + :param token_pos_map: This input contains the position and its token ID + """ + inputs = ( + torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), + {"input_pos": torch.tensor([token_pos_map[0]])}, + ) + + return inputs + + +def apply_nncf_data_aware_compression( + builder_exported: LLMEdgeManager, + quantizer: Quantizer, + awq: bool, + scale_estimation: bool, +) -> LLMEdgeManager: + """ + Applies NNCF data-aware weight compression to the exported LLM graph. + Uses the builder's tokenizer and calibration prompt to generate token-level + calibration data, then runs `nncf.experimental.torch.fx.compress_pt2e` with + the given quantizer and optional AWQ / scale estimation enabled. + + :param builder_exported: LLMEdgeManager containing the FX graph, tokenizer path, + calibration prompt, and max sequence length. + :param quantizer: TorchAO quantizer to use for compression. + :param awq: If True, enables Activation-aware Weights Quantization (AWQ). + :param scale_estimation: If True, enables NNCF's scale estimation algorithm. 
+ :return: The updated LLMEdgeManager with compressed torch FX model + """ + tokenizer = get_tokenizer(builder_exported.tokenizer_path) + + nncf_calibration_data = None + if awq or scale_estimation: + nncf_calibration_data = nncf.Dataset( + get_calibration_data( + builder_exported.pre_autograd_graph_module, + tokenizer, + builder_exported.calibration_data, + builder_exported.max_seq_len, + ), + transform_func=transform_fn, + ) + + builder_exported.pre_autograd_graph_module = ( + nncf.experimental.torch.fx.compress_pt2e( + builder_exported.pre_autograd_graph_module, + quantizer=quantizer, + dataset=nncf_calibration_data, + awq=awq, + scale_estimation=scale_estimation, + ) + ) + return builder_exported diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 519818d0aac..ff7a72318ec 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf +git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 \ No newline at end of file diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 219cc71ded1..87c87972d9a 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -16,14 +16,12 @@ import re import shlex from functools import partial - from importlib import resources as _resources from json import JSONDecodeError from pathlib import Path from typing import Callable, List, Optional, Union import torch - from executorch.devtools.backend_debug import print_delegation_info from executorch.devtools.etrecord import generate_etrecord as generate_etrecord_func from executorch.examples.models.llama.hf_download import ( @@ -241,6 +239,19 @@ def build_args_parser() -> argparse.ArgumentParser: help="Path to the adapter_config.json file. Used if the model has trained LoRA adapters. 
Must provide adapter_checkpoint.", ) + parser.add_argument( + "--openvino_awq", + required=False, + action="store_true", + help="Whether to use AWQ from NNCF. Applicable only for the OpenVINO backend.", + ) + + parser.add_argument( + "--openvino_scale_estimation", + action="store_true", + help="Whether to use Scale Estimation algorithm from NNCF. Applicable only for the OpenVINO backend", + ) + parser.add_argument( "--use_qnn_sha", action="store_true", @@ -775,7 +786,7 @@ def get_quantizer_and_quant_params(llm_config): ) quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: - assert not quantizers, "Should not enable both xnnpack and openvino" + assert not quantizers, "Should not enable openvino and other quantizers" group_size = llm_config.quantization.group_size group_size = group_size if group_size else 128 ov_quantizer = get_ov_quantizer( @@ -907,6 +918,8 @@ def _to_edge_and_lower_llama_openvino( modelname, quantizers, additional_passes, + awq, + scale_estimation, openvino_device: str = "CPU", verbose: bool = False, ) -> LLMEdgeManager: # noqa: C901 @@ -920,10 +933,15 @@ def _to_edge_and_lower_llama_openvino( for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower( - partitioners + from executorch.backends.openvino.quantizer import apply_nncf_data_aware_compression + + logging.info(f"Applying AWQ = {awq}, Scale Estimation = {scale_estimation}") + builder = apply_nncf_data_aware_compression( + builder_exported, quantizers[0], awq, scale_estimation ) + builder = builder.to_edge_transform_and_lower(partitioners) + if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) @@ -1159,6 +1177,8 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 modelname, quantizers, additional_passes, + awq=llm_config.backend.openvino.openvino_awq, + 
scale_estimation=llm_config.backend.openvino.openvino_scale_estimation, openvino_device=llm_config.backend.openvino.device, verbose=llm_config.debug.verbose, ) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index b40fad88a9c..3280e674dec 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -460,8 +460,9 @@ class OpenvinoConfig: enabled: bool = False device: str = "CPU" - nncf_compression: bool = False nncf_compression_group_size: int = 32 + openvino_awq: bool = False + openvino_scale_estimation: bool = False @dataclass @@ -659,8 +660,12 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 llm_config.backend.openvino.enabled = args.openvino if hasattr(args, "openvino_device"): llm_config.backend.openvino.device = args.openvino_device - if hasattr(args, "nncf_compression"): - llm_config.backend.openvino.nncf_compression = args.nncf_compression + if hasattr(args, "openvino_awq"): + llm_config.backend.openvino.openvino_awq = args.openvino_awq + if hasattr(args, "openvino_scale_estimation"): + llm_config.backend.openvino.openvino_scale_estimation = ( + args.openvino_scale_estimation + ) if hasattr(args, "group_size") and args.group_size: llm_config.backend.openvino.nncf_compression_group_size = args.group_size From 6c0d7663c70462950cc4d1eb558b5fffb2c527a3 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 Jan 2026 20:39:58 +0400 Subject: [PATCH 03/17] remove extra directories --- backends/openvino/quantization/__init__.py | 3 - .../openvino/quantization/nncf_compression.py | 79 ------------------- 2 files changed, 82 deletions(-) delete mode 100644 backends/openvino/quantization/__init__.py delete mode 100644 backends/openvino/quantization/nncf_compression.py diff --git a/backends/openvino/quantization/__init__.py b/backends/openvino/quantization/__init__.py deleted file mode 100644 index 5bc3585a975..00000000000 --- 
a/backends/openvino/quantization/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .nncf_compression import use_nncf_compression - -__all__ = ["use_nncf_compression"] \ No newline at end of file diff --git a/backends/openvino/quantization/nncf_compression.py b/backends/openvino/quantization/nncf_compression.py deleted file mode 100644 index 937e756f904..00000000000 --- a/backends/openvino/quantization/nncf_compression.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Intel Corporation -# -# Licensed under the BSD License (the "License"); you may not use this file -# except in compliance with the License. See the license file found in the -# LICENSE file in the root directory of this source tree. - -# mypy: disable-error-code=import-not-found - -import torch - -try: - import nncf # type: ignore[import-untyped] - from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped] -except ImportError: - raise ImportError("Please install nncf via backends/openvino/requirements.txt") - - -def get_calibration_data( - module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int -): - # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int64) - token_list = tokenizer.encode(prompts, bos=True, eos=False) - - with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id and pos < max_len: - logits = module( - torch.full((1, 1), token_list[pos]), - {"input_pos": torch.tensor((pos,))}, - ) - pos += 1 - if pos >= len(token_list): - token_list.append(torch.argmax(logits[:], dim=-1).item()) - token_list = [ - ( - pos, - token, - ) - for pos, token in enumerate(token_list) - ] - return token_list - - -def transform_fn(token_pos_map: tuple[int, str]): - # tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) - inputs = () - inputs = ( - torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), - {"input_pos": torch.tensor([token_pos_map[0]])}, - ) - - return inputs - - -def apply_nncf_data_aware_compression( - 
builder_exported, quantizers, awq: bool, scale_estimation: bool -): - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - - builder_exported.calibration_data = get_calibration_data( - builder_exported.pre_autograd_graph_module, - tokenizer, - builder_exported.calibration_data, - builder_exported.max_seq_len, - ) - - builder_exported.pre_autograd_graph_module = ( - nncf.experimental.torch.fx.compress_pt2e( - builder_exported.pre_autograd_graph_module, - quantizer=quantizers[0], - dataset=nncf.Dataset( - builder_exported.calibration_data, - transform_func=transform_fn, - ), - awq=awq, - scale_estimation=scale_estimation, - ) - ) - return builder_exported \ No newline at end of file From f9c782b3ef3a334a0b25fa91fc72681d672ebdfe Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 20:12:42 +0400 Subject: [PATCH 04/17] review changes --- backends/openvino/quantizer/llm_compression.py | 12 ++++++++---- backends/openvino/requirements.txt | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index 04f29df4d03..fca93b74831 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -86,16 +86,20 @@ def apply_nncf_data_aware_compression( :param scale_estimation: If True, enables NNCF's scale estimation algorithm. 
:return: The updated LLMEdgeManager with compressed torch FX model """ - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - nncf_calibration_data = None - if awq or scale_estimation: + if ( + builder_exported.calibration_seq_length is not None + and builder_exported.calibration_data is not None + and builder_exported.tokenizer_path is not None + and (awq or scale_estimation) + ): + tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( get_calibration_data( builder_exported.pre_autograd_graph_module, tokenizer, builder_exported.calibration_data, - builder_exported.max_seq_len, + builder_exported.calibration_seq_length, ), transform_func=transform_fn, ) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index ff7a72318ec..208f11c4388 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 \ No newline at end of file +git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 From 24f684fb37d249e42995319978075890d9ebbb2a Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 20:52:08 +0400 Subject: [PATCH 05/17] lint --- backends/openvino/quantizer/llm_compression.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index fca93b74831..96c185e38ca 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -88,11 +88,11 @@ def apply_nncf_data_aware_compression( """ nncf_calibration_data = None if ( - builder_exported.calibration_seq_length is not None - and builder_exported.calibration_data is not None - and builder_exported.tokenizer_path is not None - and (awq or scale_estimation) - ): + 
builder_exported.calibration_seq_length is not None + and builder_exported.calibration_data is not None + and builder_exported.tokenizer_path is not None + and (awq or scale_estimation) + ): tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( get_calibration_data( From 0963b73e9cf479f9da1c280836ebc8581de55a0b Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 23:38:08 +0400 Subject: [PATCH 06/17] add unit test --- backends/openvino/tests/README.md | 3 + .../tests/quantizer/synthetic_test_models.py | 21 +++ .../tests/quantizer/test_llm_compression.py | 136 ++++++++++++++++++ backends/openvino/tests/test_runner.py | 8 +- 4 files changed, 164 insertions(+), 4 deletions(-) create mode 100644 backends/openvino/tests/quantizer/synthetic_test_models.py create mode 100644 backends/openvino/tests/quantizer/test_llm_compression.py diff --git a/backends/openvino/tests/README.md b/backends/openvino/tests/README.md index 0aad14e04a0..b5624a8ca15 100644 --- a/backends/openvino/tests/README.md +++ b/backends/openvino/tests/README.md @@ -11,6 +11,8 @@ backends/openvino/tests └── test_.py # Individual op tests scripts. ├── models # Directory with model test scripts. └── test_classification.py # Test script for classification models. +├── quantizer # Directory with quantizer test scripts. + └── test_llm_compression.py # Test script for llm compression using NNCF algorithms. ├── README.md # Documentation for unit tests (this file) └── test_runner.py # Script to execute unit tests. ``` @@ -31,6 +33,7 @@ Before you begin, refer to instructions provided in [OpenVINO Backend for ExecuT Supported values: - `ops` (default) - `models` + - `quantizer` - **`--pattern`** (optional): Pattern to match test files. Provide complete file name to run individual tests. 
The default value is `test_*.py` diff --git a/backends/openvino/tests/quantizer/synthetic_test_models.py b/backends/openvino/tests/quantizer/synthetic_test_models.py new file mode 100644 index 00000000000..6bdc3650a41 --- /dev/null +++ b/backends/openvino/tests/quantizer/synthetic_test_models.py @@ -0,0 +1,21 @@ +import torch + +class SimpleTransformer(torch.nn.Module): + def __init__(self, vocab_size=100, hidden_size=64, num_layers=2): + super().__init__() + self.embed = torch.nn.Embedding(vocab_size, hidden_size) + self.layers = torch.nn.ModuleList([ + torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers) + ]) + self.lm_head = torch.nn.Linear(hidden_size, vocab_size) + self.vocab_size = vocab_size + + def forward(self, tokens, input_pos): + x = self.embed(tokens) + + for layer in self.layers: + x = torch.relu(layer(x)) + + logits = self.lm_head(x) + + return logits diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py new file mode 100644 index 00000000000..a1ce2c8d2b8 --- /dev/null +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -0,0 +1,136 @@ +import unittest +from unittest.mock import patch, Mock +import torch +from executorch.extension.llm.export.builder import LLMEdgeManager + +from executorch.backends.openvino.quantizer.llm_compression import ( + apply_nncf_data_aware_compression, + get_calibration_data, +) +from executorch.backends.openvino.quantizer import ( + OpenVINOQuantizer, + QuantizationMode, +) +from synthetic_test_models import SimpleTransformer + +class TestWeightsOnlyQuantization(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model_name = "llama" + cls.model_class_name = "Llama2Model" + cls.model = SimpleTransformer() + cls.model.eval() + + cls.max_seq_len = 128 + cls.example_inputs = (torch.tensor([[1]], dtype=torch.long), {"input_pos": torch.tensor([0], dtype=torch.long)}) + + cls.compression_configs = [ + { + "name": 
"awq_only", + "awq": True, + "scale_estimation": False, + }, + { + "name": "scale_estimation_only", + "awq": False, + "scale_estimation": True, + }, + { + "name": "awq_and_scale_estimation", + "awq": True, + "scale_estimation": True, + }, + { + "name": "no_calibration", + "awq": False, + "scale_estimation": False, + }, + ] + + cls.calibration_data = "The quick brown fox jumps over the lazy dog." + + def _create_builder(self, config_name, calibration_data=None): + builder_kwargs = { + "model": self.model, + "modelname": f"tinyllama_{config_name}", + "max_seq_len": self.max_seq_len, + "use_kv_cache": True, + "example_inputs": self.example_inputs, + "example_kwarg_inputs": None, + } + + if calibration_data: + builder_kwargs.update({ + "calibration_seq_length": 32, + "calibration_data": calibration_data, + "tokenizer_path": "dummy_path", # Will be mocked + }) + + return LLMEdgeManager(**builder_kwargs) + + + @patch('executorch.backends.openvino.quantizer.llm_compression.get_tokenizer') + @patch('executorch.backends.openvino.quantizer.llm_compression.get_calibration_data') + def test_compression_flow_with_mocked_calibration( + self, mock_get_calibration_data, mock_get_tokenizer + ): + mock_calibration_data = [ + (0, 1), (1, 5), (2, 10), (3, 15), (4, 20), + (5, 25), (6, 30), (7, 35), (8, 40), (9, 45) + ] + mock_get_calibration_data.return_value = mock_calibration_data + + mock_tokenizer = Mock() + mock_get_tokenizer.return_value = mock_tokenizer + + for config in self.compression_configs: + with self.subTest(phase="compression_config", config=config["name"]): + calibration_data = self.calibration_data if config["awq"] or config["scale_estimation"] else None + + builder = self._create_builder( + config["name"], + calibration_data=calibration_data + ) + builder.export() + import copy + original_model = copy.deepcopy(builder.pre_autograd_graph_module) + + test_input = torch.tensor([[5]], dtype=torch.long) + test_pos = torch.tensor([0], dtype=torch.long) + reference_output = 
original_model(test_input, {"input_pos": test_pos}) + + quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT4WO_SYM, group_size=-1) + builder = apply_nncf_data_aware_compression( + builder, + quantizer=quantizer, + awq=config["awq"], + scale_estimation=config["scale_estimation"], + ) + + compressed_output = builder.pre_autograd_graph_module(test_input, {"input_pos": test_pos}) + + torch.allclose(compressed_output, reference_output) + + +class TestCalibrationDataGeneration(unittest.TestCase): + """Test the calibration data generation method. We first create a mock tokenizer + and then compare it with a reference created manually""" + + def test_get_calibration_data_with_mock_module(self): + # Create mock tokenizer + mock_tokenizer = Mock() + mock_tokenizer.eos_id = 2 + mock_tokenizer.encode = Mock(return_value=[1, 5, 6]) + + mock_module = Mock() + mock_module.return_value = torch.tensor([[[0.1, 0.2, 0.9, 0.0]]]) + + result = get_calibration_data( + mock_module, + mock_tokenizer, + "test prompt", # Will be mocked + max_len=10 + ) + + positions = [item[0] for item in result] + self.assertEqual(positions, list(range(len(positions)))) diff --git a/backends/openvino/tests/test_runner.py b/backends/openvino/tests/test_runner.py index 021c372db25..bf744debd14 100644 --- a/backends/openvino/tests/test_runner.py +++ b/backends/openvino/tests/test_runner.py @@ -44,10 +44,10 @@ def parse_arguments(): parser.add_argument( "-t", "--test_type", - help="Specify the type of tests ('ops' or 'models')", + help="Specify the type of tests ('ops', 'models' or 'quantizer')", type=str, default="ops", - choices={"ops", "models"}, + choices={"ops", "models", "quantizer"}, ) args, ns_args = parser.parse_known_args(namespace=unittest) @@ -68,8 +68,8 @@ def parse_arguments(): # Discover all existing op tests in "ops" folder suite = loader.discover(test_params["test_type"], pattern=test_params["pattern"]) # Start running tests - with nncf.torch.disable_patching(): - result = 
unittest.TextTestRunner().run(suite) + # with nncf.torch.disable_patching(): + result = unittest.TextTestRunner().run(suite) if result.wasSuccessful(): print("OpenVINO backend tests completed successfully") else: From 792caf21008fb24a38f908eca8dea7adb5097df2 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 23:38:26 +0400 Subject: [PATCH 07/17] add some corner case checks in llm compression --- backends/openvino/quantizer/llm_compression.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index 96c185e38ca..bb6287721a3 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -60,7 +60,7 @@ def transform_fn(token_pos_map: Tuple[int, int]): :param token_pos_map: This input contains the position and its token ID """ inputs = ( - torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), + torch.tensor([[token_pos_map[1]]]), {"input_pos": torch.tensor([token_pos_map[0]])}, ) @@ -104,6 +104,11 @@ def apply_nncf_data_aware_compression( transform_func=transform_fn, ) + # AWQ can work without a dataset as well. 
+ if scale_estimation and not nncf_calibration_data: + msg = "Scale Estimation is enabled but no calibration dataset is provided" + raise RuntimeError(msg) + builder_exported.pre_autograd_graph_module = ( nncf.experimental.torch.fx.compress_pt2e( builder_exported.pre_autograd_graph_module, From dc3b2191494eae590c4e4c1b9d6902d61ff7dc76 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 23:50:16 +0400 Subject: [PATCH 08/17] clean unused imports --- backends/openvino/quantizer/llm_compression.py | 2 +- backends/openvino/tests/quantizer/test_llm_compression.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index bb6287721a3..d077e53fd96 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -6,7 +6,7 @@ # mypy: disable-error-code=import-not-found -from typing import Callable, List, Optional, Tuple, Union +from typing import Tuple import torch from executorch.extension.llm.export.builder import LLMEdgeManager diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index a1ce2c8d2b8..84088248562 100644 --- a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -117,7 +117,6 @@ class TestCalibrationDataGeneration(unittest.TestCase): and then compare it with a reference created manually""" def test_get_calibration_data_with_mock_module(self): - # Create mock tokenizer mock_tokenizer = Mock() mock_tokenizer.eos_id = 2 mock_tokenizer.encode = Mock(return_value=[1, 5, 6]) From 0d3d68160b1584fd7e12d6a88670120b2916a775 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Thu, 22 Jan 2026 00:06:01 +0400 Subject: [PATCH 09/17] lint --- .../tests/quantizer/synthetic_test_models.py | 17 ++-- .../tests/quantizer/test_llm_compression.py | 88 +++++++++++-------- 2 
files changed, 62 insertions(+), 43 deletions(-) diff --git a/backends/openvino/tests/quantizer/synthetic_test_models.py b/backends/openvino/tests/quantizer/synthetic_test_models.py index 6bdc3650a41..9e665fbdb3c 100644 --- a/backends/openvino/tests/quantizer/synthetic_test_models.py +++ b/backends/openvino/tests/quantizer/synthetic_test_models.py @@ -1,21 +1,22 @@ import torch -class SimpleTransformer(torch.nn.Module): + +class SimpleTransformer(torch.nn.Module): def __init__(self, vocab_size=100, hidden_size=64, num_layers=2): super().__init__() self.embed = torch.nn.Embedding(vocab_size, hidden_size) - self.layers = torch.nn.ModuleList([ - torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers) - ]) + self.layers = torch.nn.ModuleList( + [torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)] + ) self.lm_head = torch.nn.Linear(hidden_size, vocab_size) self.vocab_size = vocab_size - + def forward(self, tokens, input_pos): x = self.embed(tokens) - + for layer in self.layers: x = torch.relu(layer(x)) - + logits = self.lm_head(x) - + return logits diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index 84088248562..6d4fbfb2492 100644 --- a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -1,17 +1,16 @@ import unittest -from unittest.mock import patch, Mock +from unittest.mock import Mock, patch + import torch -from executorch.extension.llm.export.builder import LLMEdgeManager +from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode from executorch.backends.openvino.quantizer.llm_compression import ( apply_nncf_data_aware_compression, get_calibration_data, ) -from executorch.backends.openvino.quantizer import ( - OpenVINOQuantizer, - QuantizationMode, -) -from synthetic_test_models import SimpleTransformer +from executorch.extension.llm.export.builder 
import LLMEdgeManager +from synthetic_test_models import SimpleTransformer # type: ignore[import-not-found] + class TestWeightsOnlyQuantization(unittest.TestCase): @classmethod @@ -22,8 +21,11 @@ def setUpClass(cls): cls.model.eval() cls.max_seq_len = 128 - cls.example_inputs = (torch.tensor([[1]], dtype=torch.long), {"input_pos": torch.tensor([0], dtype=torch.long)}) - + cls.example_inputs = ( + torch.tensor([[1]], dtype=torch.long), + {"input_pos": torch.tensor([0], dtype=torch.long)}, + ) + cls.compression_configs = [ { "name": "awq_only", @@ -46,7 +48,7 @@ def setUpClass(cls): "scale_estimation": False, }, ] - + cls.calibration_data = "The quick brown fox jumps over the lazy dog." def _create_builder(self, config_name, calibration_data=None): @@ -60,54 +62,73 @@ def _create_builder(self, config_name, calibration_data=None): } if calibration_data: - builder_kwargs.update({ - "calibration_seq_length": 32, - "calibration_data": calibration_data, - "tokenizer_path": "dummy_path", # Will be mocked - }) - - return LLMEdgeManager(**builder_kwargs) + builder_kwargs.update( + { + "calibration_seq_length": 32, + "calibration_data": calibration_data, + "tokenizer_path": "dummy_path", # Will be mocked + } + ) + return LLMEdgeManager(**builder_kwargs) - @patch('executorch.backends.openvino.quantizer.llm_compression.get_tokenizer') - @patch('executorch.backends.openvino.quantizer.llm_compression.get_calibration_data') + @patch("executorch.backends.openvino.quantizer.llm_compression.get_tokenizer") + @patch( + "executorch.backends.openvino.quantizer.llm_compression.get_calibration_data" + ) def test_compression_flow_with_mocked_calibration( self, mock_get_calibration_data, mock_get_tokenizer ): mock_calibration_data = [ - (0, 1), (1, 5), (2, 10), (3, 15), (4, 20), - (5, 25), (6, 30), (7, 35), (8, 40), (9, 45) + (0, 1), + (1, 5), + (2, 10), + (3, 15), + (4, 20), + (5, 25), + (6, 30), + (7, 35), + (8, 40), + (9, 45), ] mock_get_calibration_data.return_value = 
mock_calibration_data - + mock_tokenizer = Mock() mock_get_tokenizer.return_value = mock_tokenizer - + for config in self.compression_configs: with self.subTest(phase="compression_config", config=config["name"]): - calibration_data = self.calibration_data if config["awq"] or config["scale_estimation"] else None + calibration_data = ( + self.calibration_data + if config["awq"] or config["scale_estimation"] + else None + ) builder = self._create_builder( - config["name"], - calibration_data=calibration_data + config["name"], calibration_data=calibration_data ) builder.export() import copy + original_model = copy.deepcopy(builder.pre_autograd_graph_module) test_input = torch.tensor([[5]], dtype=torch.long) test_pos = torch.tensor([0], dtype=torch.long) reference_output = original_model(test_input, {"input_pos": test_pos}) - quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT4WO_SYM, group_size=-1) + quantizer = OpenVINOQuantizer( + mode=QuantizationMode.INT4WO_SYM, group_size=-1 + ) builder = apply_nncf_data_aware_compression( builder, quantizer=quantizer, awq=config["awq"], scale_estimation=config["scale_estimation"], ) - - compressed_output = builder.pre_autograd_graph_module(test_input, {"input_pos": test_pos}) + + compressed_output = builder.pre_autograd_graph_module( + test_input, {"input_pos": test_pos} + ) torch.allclose(compressed_output, reference_output) @@ -120,15 +141,12 @@ def test_get_calibration_data_with_mock_module(self): mock_tokenizer = Mock() mock_tokenizer.eos_id = 2 mock_tokenizer.encode = Mock(return_value=[1, 5, 6]) - + mock_module = Mock() mock_module.return_value = torch.tensor([[[0.1, 0.2, 0.9, 0.0]]]) - + result = get_calibration_data( - mock_module, - mock_tokenizer, - "test prompt", # Will be mocked - max_len=10 + mock_module, mock_tokenizer, "test prompt", max_len=10 # Will be mocked ) positions = [item[0] for item in result] From 12efc7000dee1844740ac3eccf3261b7452f8399 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Thu, 22 Jan 2026 
16:36:01 +0400 Subject: [PATCH 10/17] review changes --- .../openvino/quantizer/llm_compression.py | 24 ++++++++++++++----- backends/openvino/tests/test_runner.py | 3 --- examples/models/llama/export_llama_lib.py | 1 - 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index d077e53fd96..a7ca5ca8f09 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -19,7 +19,7 @@ raise ImportError("Please install nncf via backends/openvino/requirements.txt") -# This code is taken from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278 +# This code is adapted from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278 def get_calibration_data( module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int ): @@ -30,7 +30,7 @@ def get_calibration_data( Currently, this method is only tested with Llama models. """ # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int64) + pos = 0 token_list = tokenizer.encode(prompts, bos=True, eos=False) with torch.no_grad(): @@ -44,7 +44,7 @@ def get_calibration_data( token_list.append(torch.argmax(logits[:], dim=-1).item()) token_list = [ ( - pos, + torch.tensor(pos, dtype=torch.int64), token, ) for pos, token in enumerate(token_list) @@ -95,7 +95,7 @@ def apply_nncf_data_aware_compression( ): tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( - get_calibration_data( + get_calibration_data( # type: ignore[arg-type] builder_exported.pre_autograd_graph_module, tokenizer, builder_exported.calibration_data, @@ -106,8 +106,20 @@ def apply_nncf_data_aware_compression( # AWQ can work without a dataset as well. 
if scale_estimation and not nncf_calibration_data: - msg = "Scale Estimation is enabled but no calibration dataset is provided" - raise RuntimeError(msg) + missing_params = [] + if builder_exported.calibration_data is None: + missing_params.append("calibration_data") + if builder_exported.calibration_seq_length is None: + missing_params.append("calibration_seq_length") + if builder_exported.tokenizer_path is None: + missing_params.append("tokenizer_path") + msg = "Scale Estimation is enabled but no calibration dataset is provided." + if missing_params: + msg += ( + " Missing required calibration parameter(s): " + + ", ".join(missing_params) + + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path." + ) builder_exported.pre_autograd_graph_module = ( nncf.experimental.torch.fx.compress_pt2e( diff --git a/backends/openvino/tests/test_runner.py b/backends/openvino/tests/test_runner.py index bf744debd14..7d8c6b968c2 100644 --- a/backends/openvino/tests/test_runner.py +++ b/backends/openvino/tests/test_runner.py @@ -1,8 +1,6 @@ import argparse import unittest -import nncf.torch # type: ignore[import-untyped,import-not-found] - class OpenvinoTestSuite(unittest.TestSuite): @@ -68,7 +66,6 @@ def parse_arguments(): # Discover all existing op tests in "ops" folder suite = loader.discover(test_params["test_type"], pattern=test_params["pattern"]) # Start running tests - # with nncf.torch.disable_patching(): result = unittest.TextTestRunner().run(suite) if result.wasSuccessful(): print("OpenVINO backend tests completed successfully") diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 87c87972d9a..0b8440440a1 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -241,7 +241,6 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "--openvino_awq", - required=False, action="store_true", help="Whether to use AWQ from NNCF. 
Applicable only for the OpenVINO backend.",
     )

From 1236dfcd6b1cef88ebe4e5dc4a63aa2f8ea98bc6 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Thu, 22 Jan 2026 16:36:27 +0400
Subject: [PATCH 11/17] compare reference scale values in tests

---
 .../tests/quantizer/synthetic_test_models.py  |   4 +-
 .../tests/quantizer/test_llm_compression.py   | 136 +++++++++++++-----
 2 files changed, 106 insertions(+), 34 deletions(-)

diff --git a/backends/openvino/tests/quantizer/synthetic_test_models.py b/backends/openvino/tests/quantizer/synthetic_test_models.py
index 9e665fbdb3c..6c7e91c5539 100644
--- a/backends/openvino/tests/quantizer/synthetic_test_models.py
+++ b/backends/openvino/tests/quantizer/synthetic_test_models.py
@@ -1,8 +1,8 @@
 import torch


-class SimpleTransformer(torch.nn.Module):
-    def __init__(self, vocab_size=100, hidden_size=64, num_layers=2):
+class ExportLlamaTestModel(torch.nn.Module):
+    def __init__(self, vocab_size=5, hidden_size=2, num_layers=1):
         super().__init__()
         self.embed = torch.nn.Embedding(vocab_size, hidden_size)
         self.layers = torch.nn.ModuleList(
diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py
index 6d4fbfb2492..252f9cd6875 100644
--- a/backends/openvino/tests/quantizer/test_llm_compression.py
+++ b/backends/openvino/tests/quantizer/test_llm_compression.py
@@ -7,17 +7,17 @@
 from executorch.backends.openvino.quantizer.llm_compression import (
     apply_nncf_data_aware_compression,
     get_calibration_data,
+    transform_fn,
 )
 from executorch.extension.llm.export.builder import LLMEdgeManager
-from synthetic_test_models import SimpleTransformer  # type: ignore[import-not-found]
+from synthetic_test_models import ExportLlamaTestModel  # type: ignore[import-not-found]


 class TestWeightsOnlyQuantization(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_name = "llama"
-        cls.model_class_name = "Llama2Model"
-        cls.model = SimpleTransformer()
+        torch.manual_seed(42)
+        cls.model = 
ExportLlamaTestModel(vocab_size=5, hidden_size=2, num_layers=1) cls.model.eval() cls.max_seq_len = 128 @@ -51,6 +51,61 @@ def setUpClass(cls): cls.calibration_data = "The quick brown fox jumps over the lazy dog." + cls.reference_scales = { + "awq_only": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + ), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.058624]], dtype=torch.float16 + ), + "relu/awq_mul._scale_value": torch.tensor([[[1.0, 1.0]]]), + "symmetric_weights_decompressor_lm_head_weight_updated_constant0._scale": torch.tensor( + [[0.053131], [0.087280], [-0.079834], [-0.068237], [-0.054626]], + dtype=torch.float16, + ), + }, + "scale_estimation_only": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + ), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.057709]], dtype=torch.float16 + ), + "symmetric_weights_decompressor_lm_head_weight._scale": torch.tensor( + [[0.0], [0.0], [-0.0], [-0.0], [-0.0]], dtype=torch.float16 + ), + }, + "awq_and_scale_estimation": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + ), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.057709]], dtype=torch.float16 + ), + "relu/awq_mul._scale_value": torch.tensor([[[1.0, 1.0]]]), + "symmetric_weights_decompressor_lm_head_weight_updated_constant0._scale": torch.tensor( + [[0.0], [0.0], [-0.0], [-0.0], [-0.0]], dtype=torch.float16 + ), + }, + "no_calibration": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + 
), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.058624]], dtype=torch.float16 + ), + "symmetric_weights_decompressor_lm_head_weight._scale": torch.tensor( + [[0.053131], [0.087280], [-0.079834], [-0.068237], [-0.054626]], + dtype=torch.float16, + ), + }, + } + def _create_builder(self, config_name, calibration_data=None): builder_kwargs = { "model": self.model, @@ -66,12 +121,29 @@ def _create_builder(self, config_name, calibration_data=None): { "calibration_seq_length": 32, "calibration_data": calibration_data, - "tokenizer_path": "dummy_path", # Will be mocked + "tokenizer_path": "dummy_path", } ) return LLMEdgeManager(**builder_kwargs) + def _extract_scales_from_model(self, model): + extracted_scales = {} + state_dict = dict(model.state_dict()) + for name, _ in state_dict.items(): + if "_scale" in name.lower(): + extracted_scales[name] = state_dict[name] + return extracted_scales + + def _compare_scales(self, extracted_scales, reference_scales): + for name, reference_value in reference_scales.items(): + self.assertIn(name, extracted_scales, f"Scale {name} not found in model") + extracted_value = extracted_scales[name] + self.assertTrue( + torch.allclose(extracted_value, reference_value), + f"Scale {name} mismatch {extracted_value}", + ) + @patch("executorch.backends.openvino.quantizer.llm_compression.get_tokenizer") @patch( "executorch.backends.openvino.quantizer.llm_compression.get_calibration_data" @@ -79,18 +151,7 @@ def _create_builder(self, config_name, calibration_data=None): def test_compression_flow_with_mocked_calibration( self, mock_get_calibration_data, mock_get_tokenizer ): - mock_calibration_data = [ - (0, 1), - (1, 5), - (2, 10), - (3, 15), - (4, 20), - (5, 25), - (6, 30), - (7, 35), - (8, 40), - (9, 45), - ] + mock_calibration_data = [(i, i) for i in range(5)] mock_get_calibration_data.return_value = mock_calibration_data mock_tokenizer = Mock() @@ -108,16 +169,13 @@ def 
test_compression_flow_with_mocked_calibration( config["name"], calibration_data=calibration_data ) builder.export() - import copy - original_model = copy.deepcopy(builder.pre_autograd_graph_module) - - test_input = torch.tensor([[5]], dtype=torch.long) + test_input = torch.tensor([[4]], dtype=torch.long) test_pos = torch.tensor([0], dtype=torch.long) - reference_output = original_model(test_input, {"input_pos": test_pos}) - + # Quantize weights for all layers(including embedding and lm_head which would by default be in INT8) + # to Per-Channel INT4 Symmetric quantizer = OpenVINOQuantizer( - mode=QuantizationMode.INT4WO_SYM, group_size=-1 + mode=QuantizationMode.INT4WO_SYM, group_size=-1, all_layers=True ) builder = apply_nncf_data_aware_compression( builder, @@ -125,17 +183,18 @@ def test_compression_flow_with_mocked_calibration( awq=config["awq"], scale_estimation=config["scale_estimation"], ) - - compressed_output = builder.pre_autograd_graph_module( - test_input, {"input_pos": test_pos} + # Run the model to check it is performant + builder.pre_autograd_graph_module(test_input, {"input_pos": test_pos}) + extracted_scales = self._extract_scales_from_model( + builder.pre_autograd_graph_module + ) + self._compare_scales( + extracted_scales, + self.reference_scales[config["name"]], ) - - torch.allclose(compressed_output, reference_output) class TestCalibrationDataGeneration(unittest.TestCase): - """Test the calibration data generation method. 
We first create a mock tokenizer - and then compare it with a reference created manually""" def test_get_calibration_data_with_mock_module(self): mock_tokenizer = Mock() @@ -146,8 +205,21 @@ def test_get_calibration_data_with_mock_module(self): mock_module.return_value = torch.tensor([[[0.1, 0.2, 0.9, 0.0]]]) result = get_calibration_data( - mock_module, mock_tokenizer, "test prompt", max_len=10 # Will be mocked + mock_module, mock_tokenizer, "test prompt", max_len=10 ) positions = [item[0] for item in result] self.assertEqual(positions, list(range(len(positions)))) + + def test_transform_fn(self): + token_pos_map = (5, 10) + result = transform_fn(token_pos_map) + + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 2) + + token, input_pos_dict = result + self.assertEqual(token.shape, torch.Size([1, 1])) + self.assertEqual(token, torch.tensor([[10]])) + self.assertIn("input_pos", input_pos_dict) + self.assertEqual(input_pos_dict["input_pos"], torch.tensor([5])) From 019b2cce0b9a8a5ce2ccf1e3e7954f0f6161ea22 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Thu, 22 Jan 2026 16:47:05 +0400 Subject: [PATCH 12/17] remove dead code --- backends/openvino/quantizer/llm_compression.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index a7ca5ca8f09..c4862161aa0 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -113,7 +113,6 @@ def apply_nncf_data_aware_compression( missing_params.append("calibration_seq_length") if builder_exported.tokenizer_path is None: missing_params.append("tokenizer_path") - msg = "Scale Estimation is enabled but no calibration dataset is provided." 
if missing_params: msg += ( " Missing required calibration parameter(s): " From 562261f0bab0ce86e6f527616564a1788f4f7944 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Feb 2026 14:35:12 +0400 Subject: [PATCH 13/17] lint fixes --- backends/openvino/quantizer/llm_compression.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index c4862161aa0..86b45d55ff2 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -95,8 +95,8 @@ def apply_nncf_data_aware_compression( ): tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( - get_calibration_data( # type: ignore[arg-type] - builder_exported.pre_autograd_graph_module, + get_calibration_data( + builder_exported.pre_autograd_graph_module, # type: ignore[arg-type] tokenizer, builder_exported.calibration_data, builder_exported.calibration_seq_length, @@ -114,11 +114,12 @@ def apply_nncf_data_aware_compression( if builder_exported.tokenizer_path is None: missing_params.append("tokenizer_path") if missing_params: - msg += ( + msg = ( " Missing required calibration parameter(s): " + ", ".join(missing_params) + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path." 
) + raise ValueError(msg) builder_exported.pre_autograd_graph_module = ( nncf.experimental.torch.fx.compress_pt2e( From ecd5b8a85d353658d867fa62e21a3a721c13e522 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Feb 2026 14:35:28 +0400 Subject: [PATCH 14/17] extend test for error --- .../tests/quantizer/test_llm_compression.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index 252f9cd6875..e06a750a0f9 100644 --- a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -193,6 +193,28 @@ def test_compression_flow_with_mocked_calibration( self.reference_scales[config["name"]], ) + def test_scale_estimation_requires_calibration_params(self): + builder = self._create_builder("missing_calibration_data", calibration_data=None) + builder.export() + + quantizer = OpenVINOQuantizer( + mode=QuantizationMode.INT4WO_SYM, group_size=-1, all_layers=True + ) + + with self.assertRaises(ValueError) as cm: + apply_nncf_data_aware_compression( + builder, + quantizer=quantizer, + awq=False, + scale_estimation=True, + ) + + err = str(cm.exception) + self.assertIn("Missing required calibration parameter(s)", err) + self.assertIn("calibration_data", err) + self.assertIn("calibration_seq_length", err) + self.assertIn("tokenizer_path", err) + class TestCalibrationDataGeneration(unittest.TestCase): From d72466d8c8e5d79729d26146696cb7cb20c3438f Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Feb 2026 14:49:40 +0400 Subject: [PATCH 15/17] lint --- backends/openvino/tests/quantizer/test_llm_compression.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index e06a750a0f9..6dfef1fb600 100644 --- 
a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -194,7 +194,9 @@ def test_compression_flow_with_mocked_calibration( ) def test_scale_estimation_requires_calibration_params(self): - builder = self._create_builder("missing_calibration_data", calibration_data=None) + builder = self._create_builder( + "missing_calibration_data", calibration_data=None + ) builder.export() quantizer = OpenVINOQuantizer( From 83f0fb8910e757746dd250128edd47dd4a1883d5 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 4 Feb 2026 11:36:53 +0400 Subject: [PATCH 16/17] remove leading space in error message --- backends/openvino/quantizer/llm_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index 86b45d55ff2..1737f638bf9 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -115,7 +115,7 @@ def apply_nncf_data_aware_compression( missing_params.append("tokenizer_path") if missing_params: msg = ( - " Missing required calibration parameter(s): " + "Missing required calibration parameter(s): " + ", ".join(missing_params) + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path." ) From 0093592df4a3f7cedf803207c656e74b6e8ee7d6 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Tue, 24 Feb 2026 18:22:56 +0400 Subject: [PATCH 17/17] update nncf version to 3.0.0 --- backends/openvino/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 814b0d68fff..ba338416583 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1 +1 @@ -git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 +nncf==3.0.0