From ece4a0b0b2a945225cac471407add249f19c70a1 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 16 Jan 2026 21:59:15 +0400 Subject: [PATCH 01/17] Extend quantizer to support compress_pt2e --- backends/openvino/quantizer/quantizer.py | 106 +++++++++++++++++++---- 1 file changed, 90 insertions(+), 16 deletions(-) diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py index 5766013689b..28042b5a5f3 100644 --- a/backends/openvino/quantizer/quantizer.py +++ b/backends/openvino/quantizer/quantizer.py @@ -13,7 +13,6 @@ import nncf # type: ignore[import-untyped] import nncf.common.quantization as quantization # type: ignore[import-untyped] import nncf.experimental.torch.fx as nncf_fx # type: ignore[import-untyped] - import torch.fx from executorch.backends.openvino.quantizer.observers import ( INT4WeightObserver, @@ -78,12 +77,12 @@ class OpenVINOQuantizer(Quantizer): optimally for the inference via OpenVINO. """ - WEIGHTS_ONLY_COMPRESSION_MODES = ( - QuantizationMode.INT4WO_SYM, - QuantizationMode.INT4WO_ASYM, - QuantizationMode.INT8WO_SYM, - QuantizationMode.INT8WO_ASYM, - ) + WEIGHTS_ONLY_COMPRESSION_MODES = { + QuantizationMode.INT4WO_SYM: "int4_sym", + QuantizationMode.INT4WO_ASYM: "int4_asym", + QuantizationMode.INT8WO_SYM: "int8_sym", + QuantizationMode.INT8WO_ASYM: "int8_asym", + } def __init__( self, @@ -116,17 +115,63 @@ def __init__( preset=preset, model_type=model_type, **kwargs ) else: - compression_mode = mode.value.replace( - "wo", "" - ) # Mode value has to match NNCF CompressWeightsMode + compression_mode = OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES[ + mode + ] # Mode value has to match NNCF CompressWeightsMode weight_compression_configuration = get_weight_compression_configuration( nncf.CompressWeightsMode(compression_mode), **kwargs, ) - subset_size = 1 # Doesn't really matter in this case since it is data-free. 
Should just be +ve + weight_compression_configuration["subset_size"] = ( + 1 # Doesn't really matter in this case since it is data-free. Should just be +ve + ) + self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression( - subset_size=subset_size, **weight_compression_configuration + **weight_compression_configuration + ) + + def _require_wc_algo( + self, + ) -> nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression: + if not isinstance( + self._algo, + nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression, + ): + raise TypeError( + "This method requires WeightCompression algo, but " + f"got {type(self._algo).__name__} (mode={self.mode})." ) + return self._algo + + def _require_ptq_algo(self) -> MinMaxQuantization: + if not isinstance(self._algo, MinMaxQuantization): + raise TypeError( + "This method requires MinMaxQuantization algo, but " + f"got {type(self._algo).__name__} (mode={self.mode})." + ) + return self._algo + + def get_weights_compression_config(self) -> Dict[str, Any]: + """ + Returns a dictionary with all_layers, group_size, backup_mode and Quantization mode parameters + used by the compress_pt2e weight compression algorithm. + + :return: A dictionary containing: + 1. mode: Quantization mode. One of INT4 Sym, INT4 Asym, INT8 Sym, INT8 Asym. + 2. group_size: group size to be used for group-wise compression. + 3. all_layers: Indicates whether embeddings and last MatMul layers should be compressed to a primary + precision. By default, the backup precision is assigned for the embeddings and last MatMul layers. + 4. backup_mode: Defines a backup mode for mixed-precision weight compression. 
+ """ + algo = self._require_wc_algo() + quantizer_initialized_algo_attributes = { + "mode": algo.mode, + "group_size": algo.group_size, + "all_layers": algo.all_layers, + "backup_mode": algo.backup_mode, + } + + return quantizer_initialized_algo_attributes def set_ignored_scope( self, @@ -160,8 +205,32 @@ def set_ignored_scope( def get_nncf_quantization_setup( self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph ) -> quantization.quantizer_setup.SingleConfigQuantizerSetup: - self._algo._set_backend_entity(model) - return self._algo.find_quantization_setup(model, nncf_graph) + algo = self._require_ptq_algo() + algo._set_backend_entity(model) + return algo.find_quantization_setup(model, nncf_graph) + + def get_nncf_weight_compression_parameters( + self, + model: torch.fx.GraphModule, + nncf_graph: NNCFGraph, + ) -> Tuple[ + List[WeightCompressionParameters], + List[WeightCompressionParameters], + List[WeightCompressionParameters], + ]: + """ + Collect weight compression parameters for the given FX model and NNCF graph. + + :param model: FX GraphModule to analyze for weight compression. + :param nncf_graph: NNCFGraph representation of the model. + :return: A tuple of: + - all parameters eligible for weight compression, + - ratio-defining parameters used to set primary/backup precisions, + - parameters that are not compressible and remain in original precision. + """ + algo = self._require_wc_algo() + algo.set_backend_entity(model) + return algo.get_weight_compression_parameters(model, nncf_graph) def _annotate_weight_compression( self, @@ -182,12 +251,17 @@ def _annotate_weight_compression( :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations. :return: Updated mapping of FX nodes with weight compression annotations. 
""" - self._algo.set_backend_entity(model) - all_wc_params, _ = self._algo.get_weight_compression_parameters( + all_wc_params, *_ = self.get_nncf_weight_compression_parameters( model, nncf_graph ) for wc_param in all_wc_params: + if not wc_param.compression_config: + nncf_logger.debug( + "Skipping weight compression for node '%s' because compression_config is missing.", + getattr(wc_param.node_with_weight, "node_name", ""), + ) + continue node_with_weight = wc_param.node_with_weight target_node = nncf_fx.node_utils.get_graph_node_by_name( graph, node_with_weight.node_name From 9cc099161595a6d9c4fb8d07070291314a18ea3d Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 16 Jan 2026 22:00:09 +0400 Subject: [PATCH 02/17] integrate compress_pt2e into the example --- backends/openvino/quantization/__init__.py | 3 + .../openvino/quantization/nncf_compression.py | 79 ++++++++++++ backends/openvino/quantizer/__init__.py | 8 +- .../openvino/quantizer/llm_compression.py | 112 ++++++++++++++++++ backends/openvino/requirements.txt | 2 +- examples/models/llama/export_llama_lib.py | 30 ++++- extension/llm/export/config/llm_config.py | 11 +- 7 files changed, 235 insertions(+), 10 deletions(-) create mode 100644 backends/openvino/quantization/__init__.py create mode 100644 backends/openvino/quantization/nncf_compression.py create mode 100644 backends/openvino/quantizer/llm_compression.py diff --git a/backends/openvino/quantization/__init__.py b/backends/openvino/quantization/__init__.py new file mode 100644 index 00000000000..5bc3585a975 --- /dev/null +++ b/backends/openvino/quantization/__init__.py @@ -0,0 +1,3 @@ +from .nncf_compression import use_nncf_compression + +__all__ = ["use_nncf_compression"] \ No newline at end of file diff --git a/backends/openvino/quantization/nncf_compression.py b/backends/openvino/quantization/nncf_compression.py new file mode 100644 index 00000000000..937e756f904 --- /dev/null +++ b/backends/openvino/quantization/nncf_compression.py @@ -0,0 +1,79 @@ 
+# Copyright (c) Intel Corporation +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file found in the +# LICENSE file in the root directory of this source tree. + +# mypy: disable-error-code=import-not-found + +import torch + +try: + import nncf # type: ignore[import-untyped] + from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped] +except ImportError: + raise ImportError("Please install nncf via backends/openvino/requirements.txt") + + +def get_calibration_data( + module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int +): + # TODO: change criteria & support batch inputs if necessary + pos = torch.tensor(0, dtype=torch.int64) + token_list = tokenizer.encode(prompts, bos=True, eos=False) + + with torch.no_grad(): + while token_list[-1] != tokenizer.eos_id and pos < max_len: + logits = module( + torch.full((1, 1), token_list[pos]), + {"input_pos": torch.tensor((pos,))}, + ) + pos += 1 + if pos >= len(token_list): + token_list.append(torch.argmax(logits[:], dim=-1).item()) + token_list = [ + ( + pos, + token, + ) + for pos, token in enumerate(token_list) + ] + return token_list + + +def transform_fn(token_pos_map: tuple[int, str]): + # tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) + inputs = () + inputs = ( + torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), + {"input_pos": torch.tensor([token_pos_map[0]])}, + ) + + return inputs + + +def apply_nncf_data_aware_compression( + builder_exported, quantizers, awq: bool, scale_estimation: bool +): + tokenizer = get_tokenizer(builder_exported.tokenizer_path) + + builder_exported.calibration_data = get_calibration_data( + builder_exported.pre_autograd_graph_module, + tokenizer, + builder_exported.calibration_data, + builder_exported.max_seq_len, + ) + + builder_exported.pre_autograd_graph_module = ( + nncf.experimental.torch.fx.compress_pt2e( + 
builder_exported.pre_autograd_graph_module, + quantizer=quantizers[0], + dataset=nncf.Dataset( + builder_exported.calibration_data, + transform_func=transform_fn, + ), + awq=awq, + scale_estimation=scale_estimation, + ) + ) + return builder_exported \ No newline at end of file diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py index 5aae52ef3e8..e819aaf5159 100644 --- a/backends/openvino/quantizer/__init__.py +++ b/backends/openvino/quantizer/__init__.py @@ -1,3 +1,9 @@ +from .llm_compression import apply_nncf_data_aware_compression from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model -__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"] +__all__ = [ + "OpenVINOQuantizer", + "quantize_model", + "QuantizationMode", + "apply_nncf_data_aware_compression", +] diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py new file mode 100644 index 00000000000..04f29df4d03 --- /dev/null +++ b/backends/openvino/quantizer/llm_compression.py @@ -0,0 +1,112 @@ +# Copyright (c) Intel Corporation +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file found in the +# LICENSE file in the root directory of this source tree. 
+ +# mypy: disable-error-code=import-not-found + +from typing import Callable, List, Optional, Tuple, Union + +import torch +from executorch.extension.llm.export.builder import LLMEdgeManager +from torchao.quantization.pt2e.quantizer import Quantizer + +try: + import nncf # type: ignore[import-untyped] + from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped] +except ImportError: + raise ImportError("Please install nncf via backends/openvino/requirements.txt") + + +# This code is taken from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278 +def get_calibration_data( + module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int +): + """ + This method is used to obtain calibration data from a prompt so that the algorithm + is calibrated not only with the dataset but also the inputs which are output by + the model. + Currently, this method is only tested with Llama models. + """ + # TODO: change criteria & support batch inputs if necessary + pos = torch.tensor(0, dtype=torch.int64) + token_list = tokenizer.encode(prompts, bos=True, eos=False) + + with torch.no_grad(): + while token_list[-1] != tokenizer.eos_id and pos < max_len: + logits = module( + torch.full((1, 1), token_list[pos]), + {"input_pos": torch.tensor((pos,))}, + ) + pos += 1 + if pos >= len(token_list): + token_list.append(torch.argmax(logits[:], dim=-1).item()) + token_list = [ + ( + pos, + token, + ) + for pos, token in enumerate(token_list) + ] + return token_list + + +def transform_fn(token_pos_map: Tuple[int, int]): + """ + Transforms and returns input from dataset so that it is acceptable by the model + Currently, this method is only tested with Llama models. 
+ + :param token_pos_map: This input contains the position and its token ID + """ + inputs = ( + torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), + {"input_pos": torch.tensor([token_pos_map[0]])}, + ) + + return inputs + + +def apply_nncf_data_aware_compression( + builder_exported: LLMEdgeManager, + quantizer: Quantizer, + awq: bool, + scale_estimation: bool, +) -> LLMEdgeManager: + """ + Applies NNCF data-aware weight compression to the exported LLM graph. + Uses the builder's tokenizer and calibration prompt to generate token-level + calibration data, then runs `nncf.experimental.torch.fx.compress_pt2e` with + the given quantizer and optional AWQ / scale estimation enabled. + + :param builder_exported: LLMEdgeManager containing the FX graph, tokenizer path, + calibration prompt, and max sequence length. + :param quantizer: TorchAO quantizer to use for compression. + :param awq: If True, enables Activation-aware Weights Quantization (AWQ). + :param scale_estimation: If True, enables NNCF's scale estimation algorithm. 
+ :return: The updated LLMEdgeManager with compressed torch FX model + """ + tokenizer = get_tokenizer(builder_exported.tokenizer_path) + + nncf_calibration_data = None + if awq or scale_estimation: + nncf_calibration_data = nncf.Dataset( + get_calibration_data( + builder_exported.pre_autograd_graph_module, + tokenizer, + builder_exported.calibration_data, + builder_exported.max_seq_len, + ), + transform_func=transform_fn, + ) + + builder_exported.pre_autograd_graph_module = ( + nncf.experimental.torch.fx.compress_pt2e( + builder_exported.pre_autograd_graph_module, + quantizer=quantizer, + dataset=nncf_calibration_data, + awq=awq, + scale_estimation=scale_estimation, + ) + ) + return builder_exported diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 519818d0aac..ff7a72318ec 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf +git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 \ No newline at end of file diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 219cc71ded1..87c87972d9a 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -16,14 +16,12 @@ import re import shlex from functools import partial - from importlib import resources as _resources from json import JSONDecodeError from pathlib import Path from typing import Callable, List, Optional, Union import torch - from executorch.devtools.backend_debug import print_delegation_info from executorch.devtools.etrecord import generate_etrecord as generate_etrecord_func from executorch.examples.models.llama.hf_download import ( @@ -241,6 +239,19 @@ def build_args_parser() -> argparse.ArgumentParser: help="Path to the adapter_config.json file. Used if the model has trained LoRA adapters. 
Must provide adapter_checkpoint.", ) + parser.add_argument( + "--openvino_awq", + required=False, + action="store_true", + help="Whether to use AWQ from NNCF. Applicable only for the OpenVINO backend.", + ) + + parser.add_argument( + "--openvino_scale_estimation", + action="store_true", + help="Whether to use Scale Estimation algorithm from NNCF. Applicable only for the OpenVINO backend", + ) + parser.add_argument( "--use_qnn_sha", action="store_true", @@ -775,7 +786,7 @@ def get_quantizer_and_quant_params(llm_config): ) quantizers.append(qnn_quantizer) if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize: - assert not quantizers, "Should not enable both xnnpack and openvino" + assert not quantizers, "Should not enable openvino and other quantizers" group_size = llm_config.quantization.group_size group_size = group_size if group_size else 128 ov_quantizer = get_ov_quantizer( @@ -907,6 +918,8 @@ def _to_edge_and_lower_llama_openvino( modelname, quantizers, additional_passes, + awq, + scale_estimation, openvino_device: str = "CPU", verbose: bool = False, ) -> LLMEdgeManager: # noqa: C901 @@ -920,10 +933,15 @@ def _to_edge_and_lower_llama_openvino( for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower( - partitioners + from executorch.backends.openvino.quantizer import apply_nncf_data_aware_compression + + logging.info(f"Applying AWQ = {awq}, Scale Estimation = {scale_estimation}") + builder = apply_nncf_data_aware_compression( + builder_exported, quantizers[0], awq, scale_estimation ) + builder = builder.to_edge_transform_and_lower(partitioners) + if verbose: print_delegation_info(builder.edge_manager.exported_program().graph_module) @@ -1159,6 +1177,8 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901 modelname, quantizers, additional_passes, + awq=llm_config.backend.openvino.openvino_awq, + 
scale_estimation=llm_config.backend.openvino.openvino_scale_estimation, openvino_device=llm_config.backend.openvino.device, verbose=llm_config.debug.verbose, ) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index b40fad88a9c..3280e674dec 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -460,8 +460,9 @@ class OpenvinoConfig: enabled: bool = False device: str = "CPU" - nncf_compression: bool = False nncf_compression_group_size: int = 32 + openvino_awq: bool = False + openvino_scale_estimation: bool = False @dataclass @@ -659,8 +660,12 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 llm_config.backend.openvino.enabled = args.openvino if hasattr(args, "openvino_device"): llm_config.backend.openvino.device = args.openvino_device - if hasattr(args, "nncf_compression"): - llm_config.backend.openvino.nncf_compression = args.nncf_compression + if hasattr(args, "openvino_awq"): + llm_config.backend.openvino.openvino_awq = args.openvino_awq + if hasattr(args, "openvino_scale_estimation"): + llm_config.backend.openvino.openvino_scale_estimation = ( + args.openvino_scale_estimation + ) if hasattr(args, "group_size") and args.group_size: llm_config.backend.openvino.nncf_compression_group_size = args.group_size From 6c0d7663c70462950cc4d1eb558b5fffb2c527a3 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 Jan 2026 20:39:58 +0400 Subject: [PATCH 03/17] remove extra directories --- backends/openvino/quantization/__init__.py | 3 - .../openvino/quantization/nncf_compression.py | 79 ------------------- 2 files changed, 82 deletions(-) delete mode 100644 backends/openvino/quantization/__init__.py delete mode 100644 backends/openvino/quantization/nncf_compression.py diff --git a/backends/openvino/quantization/__init__.py b/backends/openvino/quantization/__init__.py deleted file mode 100644 index 5bc3585a975..00000000000 --- 
a/backends/openvino/quantization/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .nncf_compression import use_nncf_compression - -__all__ = ["use_nncf_compression"] \ No newline at end of file diff --git a/backends/openvino/quantization/nncf_compression.py b/backends/openvino/quantization/nncf_compression.py deleted file mode 100644 index 937e756f904..00000000000 --- a/backends/openvino/quantization/nncf_compression.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Intel Corporation -# -# Licensed under the BSD License (the "License"); you may not use this file -# except in compliance with the License. See the license file found in the -# LICENSE file in the root directory of this source tree. - -# mypy: disable-error-code=import-not-found - -import torch - -try: - import nncf # type: ignore[import-untyped] - from pytorch_tokenizers import get_tokenizer # type: ignore[import-untyped] -except ImportError: - raise ImportError("Please install nncf via backends/openvino/requirements.txt") - - -def get_calibration_data( - module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int -): - # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int64) - token_list = tokenizer.encode(prompts, bos=True, eos=False) - - with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id and pos < max_len: - logits = module( - torch.full((1, 1), token_list[pos]), - {"input_pos": torch.tensor((pos,))}, - ) - pos += 1 - if pos >= len(token_list): - token_list.append(torch.argmax(logits[:], dim=-1).item()) - token_list = [ - ( - pos, - token, - ) - for pos, token in enumerate(token_list) - ] - return token_list - - -def transform_fn(token_pos_map: tuple[int, str]): - # tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) - inputs = () - inputs = ( - torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), - {"input_pos": torch.tensor([token_pos_map[0]])}, - ) - - return inputs - - -def apply_nncf_data_aware_compression( - 
builder_exported, quantizers, awq: bool, scale_estimation: bool -): - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - - builder_exported.calibration_data = get_calibration_data( - builder_exported.pre_autograd_graph_module, - tokenizer, - builder_exported.calibration_data, - builder_exported.max_seq_len, - ) - - builder_exported.pre_autograd_graph_module = ( - nncf.experimental.torch.fx.compress_pt2e( - builder_exported.pre_autograd_graph_module, - quantizer=quantizers[0], - dataset=nncf.Dataset( - builder_exported.calibration_data, - transform_func=transform_fn, - ), - awq=awq, - scale_estimation=scale_estimation, - ) - ) - return builder_exported \ No newline at end of file From f9c782b3ef3a334a0b25fa91fc72681d672ebdfe Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 20:12:42 +0400 Subject: [PATCH 04/17] review changes --- backends/openvino/quantizer/llm_compression.py | 12 ++++++++---- backends/openvino/requirements.txt | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index 04f29df4d03..fca93b74831 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -86,16 +86,20 @@ def apply_nncf_data_aware_compression( :param scale_estimation: If True, enables NNCF's scale estimation algorithm. 
:return: The updated LLMEdgeManager with compressed torch FX model """ - tokenizer = get_tokenizer(builder_exported.tokenizer_path) - nncf_calibration_data = None - if awq or scale_estimation: + if ( + builder_exported.calibration_seq_length is not None + and builder_exported.calibration_data is not None + and builder_exported.tokenizer_path is not None + and (awq or scale_estimation) + ): + tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( get_calibration_data( builder_exported.pre_autograd_graph_module, tokenizer, builder_exported.calibration_data, - builder_exported.max_seq_len, + builder_exported.calibration_seq_length, ), transform_func=transform_fn, ) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index ff7a72318ec..208f11c4388 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 \ No newline at end of file +git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 From 24f684fb37d249e42995319978075890d9ebbb2a Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 20:52:08 +0400 Subject: [PATCH 05/17] lint --- backends/openvino/quantizer/llm_compression.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index fca93b74831..96c185e38ca 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -88,11 +88,11 @@ def apply_nncf_data_aware_compression( """ nncf_calibration_data = None if ( - builder_exported.calibration_seq_length is not None - and builder_exported.calibration_data is not None - and builder_exported.tokenizer_path is not None - and (awq or scale_estimation) - ): + 
builder_exported.calibration_seq_length is not None + and builder_exported.calibration_data is not None + and builder_exported.tokenizer_path is not None + and (awq or scale_estimation) + ): tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( get_calibration_data( From 0963b73e9cf479f9da1c280836ebc8581de55a0b Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 23:38:08 +0400 Subject: [PATCH 06/17] add unit test --- backends/openvino/tests/README.md | 3 + .../tests/quantizer/synthetic_test_models.py | 21 +++ .../tests/quantizer/test_llm_compression.py | 136 ++++++++++++++++++ backends/openvino/tests/test_runner.py | 8 +- 4 files changed, 164 insertions(+), 4 deletions(-) create mode 100644 backends/openvino/tests/quantizer/synthetic_test_models.py create mode 100644 backends/openvino/tests/quantizer/test_llm_compression.py diff --git a/backends/openvino/tests/README.md b/backends/openvino/tests/README.md index 0aad14e04a0..b5624a8ca15 100644 --- a/backends/openvino/tests/README.md +++ b/backends/openvino/tests/README.md @@ -11,6 +11,8 @@ backends/openvino/tests └── test_.py # Individual op tests scripts. ├── models # Directory with model test scripts. └── test_classification.py # Test script for classification models. +├── quantizer # Directory with quantizer test scripts. + └── test_llm_compression.py # Test script for llm compression using NNCF algorithms. ├── README.md # Documentation for unit tests (this file) └── test_runner.py # Script to execute unit tests. ``` @@ -31,6 +33,7 @@ Before you begin, refer to instructions provided in [OpenVINO Backend for ExecuT Supported values: - `ops` (default) - `models` + - `quantizer` - **`--pattern`** (optional): Pattern to match test files. Provide complete file name to run individual tests. 
The default value is `test_*.py` diff --git a/backends/openvino/tests/quantizer/synthetic_test_models.py b/backends/openvino/tests/quantizer/synthetic_test_models.py new file mode 100644 index 00000000000..6bdc3650a41 --- /dev/null +++ b/backends/openvino/tests/quantizer/synthetic_test_models.py @@ -0,0 +1,21 @@ +import torch + +class SimpleTransformer(torch.nn.Module): + def __init__(self, vocab_size=100, hidden_size=64, num_layers=2): + super().__init__() + self.embed = torch.nn.Embedding(vocab_size, hidden_size) + self.layers = torch.nn.ModuleList([ + torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers) + ]) + self.lm_head = torch.nn.Linear(hidden_size, vocab_size) + self.vocab_size = vocab_size + + def forward(self, tokens, input_pos): + x = self.embed(tokens) + + for layer in self.layers: + x = torch.relu(layer(x)) + + logits = self.lm_head(x) + + return logits diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py new file mode 100644 index 00000000000..a1ce2c8d2b8 --- /dev/null +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -0,0 +1,136 @@ +import unittest +from unittest.mock import patch, Mock +import torch +from executorch.extension.llm.export.builder import LLMEdgeManager + +from executorch.backends.openvino.quantizer.llm_compression import ( + apply_nncf_data_aware_compression, + get_calibration_data, +) +from executorch.backends.openvino.quantizer import ( + OpenVINOQuantizer, + QuantizationMode, +) +from synthetic_test_models import SimpleTransformer + +class TestWeightsOnlyQuantization(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model_name = "llama" + cls.model_class_name = "Llama2Model" + cls.model = SimpleTransformer() + cls.model.eval() + + cls.max_seq_len = 128 + cls.example_inputs = (torch.tensor([[1]], dtype=torch.long), {"input_pos": torch.tensor([0], dtype=torch.long)}) + + cls.compression_configs = [ + { + "name": 
"awq_only", + "awq": True, + "scale_estimation": False, + }, + { + "name": "scale_estimation_only", + "awq": False, + "scale_estimation": True, + }, + { + "name": "awq_and_scale_estimation", + "awq": True, + "scale_estimation": True, + }, + { + "name": "no_calibration", + "awq": False, + "scale_estimation": False, + }, + ] + + cls.calibration_data = "The quick brown fox jumps over the lazy dog." + + def _create_builder(self, config_name, calibration_data=None): + builder_kwargs = { + "model": self.model, + "modelname": f"tinyllama_{config_name}", + "max_seq_len": self.max_seq_len, + "use_kv_cache": True, + "example_inputs": self.example_inputs, + "example_kwarg_inputs": None, + } + + if calibration_data: + builder_kwargs.update({ + "calibration_seq_length": 32, + "calibration_data": calibration_data, + "tokenizer_path": "dummy_path", # Will be mocked + }) + + return LLMEdgeManager(**builder_kwargs) + + + @patch('executorch.backends.openvino.quantizer.llm_compression.get_tokenizer') + @patch('executorch.backends.openvino.quantizer.llm_compression.get_calibration_data') + def test_compression_flow_with_mocked_calibration( + self, mock_get_calibration_data, mock_get_tokenizer + ): + mock_calibration_data = [ + (0, 1), (1, 5), (2, 10), (3, 15), (4, 20), + (5, 25), (6, 30), (7, 35), (8, 40), (9, 45) + ] + mock_get_calibration_data.return_value = mock_calibration_data + + mock_tokenizer = Mock() + mock_get_tokenizer.return_value = mock_tokenizer + + for config in self.compression_configs: + with self.subTest(phase="compression_config", config=config["name"]): + calibration_data = self.calibration_data if config["awq"] or config["scale_estimation"] else None + + builder = self._create_builder( + config["name"], + calibration_data=calibration_data + ) + builder.export() + import copy + original_model = copy.deepcopy(builder.pre_autograd_graph_module) + + test_input = torch.tensor([[5]], dtype=torch.long) + test_pos = torch.tensor([0], dtype=torch.long) + reference_output = 
original_model(test_input, {"input_pos": test_pos}) + + quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT4WO_SYM, group_size=-1) + builder = apply_nncf_data_aware_compression( + builder, + quantizer=quantizer, + awq=config["awq"], + scale_estimation=config["scale_estimation"], + ) + + compressed_output = builder.pre_autograd_graph_module(test_input, {"input_pos": test_pos}) + + torch.allclose(compressed_output, reference_output) + + +class TestCalibrationDataGeneration(unittest.TestCase): + """Test the calibration data generation method. We first create a mock tokenizer + and then compare it with a reference created manually""" + + def test_get_calibration_data_with_mock_module(self): + # Create mock tokenizer + mock_tokenizer = Mock() + mock_tokenizer.eos_id = 2 + mock_tokenizer.encode = Mock(return_value=[1, 5, 6]) + + mock_module = Mock() + mock_module.return_value = torch.tensor([[[0.1, 0.2, 0.9, 0.0]]]) + + result = get_calibration_data( + mock_module, + mock_tokenizer, + "test prompt", # Will be mocked + max_len=10 + ) + + positions = [item[0] for item in result] + self.assertEqual(positions, list(range(len(positions)))) diff --git a/backends/openvino/tests/test_runner.py b/backends/openvino/tests/test_runner.py index 021c372db25..bf744debd14 100644 --- a/backends/openvino/tests/test_runner.py +++ b/backends/openvino/tests/test_runner.py @@ -44,10 +44,10 @@ def parse_arguments(): parser.add_argument( "-t", "--test_type", - help="Specify the type of tests ('ops' or 'models')", + help="Specify the type of tests ('ops', 'models' or 'quantizer')", type=str, default="ops", - choices={"ops", "models"}, + choices={"ops", "models", "quantizer"}, ) args, ns_args = parser.parse_known_args(namespace=unittest) @@ -68,8 +68,8 @@ def parse_arguments(): # Discover all existing op tests in "ops" folder suite = loader.discover(test_params["test_type"], pattern=test_params["pattern"]) # Start running tests - with nncf.torch.disable_patching(): - result = 
unittest.TextTestRunner().run(suite) + # with nncf.torch.disable_patching(): + result = unittest.TextTestRunner().run(suite) if result.wasSuccessful(): print("OpenVINO backend tests completed successfully") else: From 792caf21008fb24a38f908eca8dea7adb5097df2 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 23:38:26 +0400 Subject: [PATCH 07/17] add some corner case checks in llm compression --- backends/openvino/quantizer/llm_compression.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index 96c185e38ca..bb6287721a3 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -60,7 +60,7 @@ def transform_fn(token_pos_map: Tuple[int, int]): :param token_pos_map: This input contains the position and its token ID """ inputs = ( - torch.tensor(token_pos_map[1]).unsqueeze(0).unsqueeze(0), + torch.tensor([[token_pos_map[1]]]), {"input_pos": torch.tensor([token_pos_map[0]])}, ) @@ -104,6 +104,11 @@ def apply_nncf_data_aware_compression( transform_func=transform_fn, ) + # AWQ can work without a dataset as well. 
+ if scale_estimation and not nncf_calibration_data: + msg = "Scale Estimation is enabled but no calibration dataset is provided" + raise RuntimeError(msg) + builder_exported.pre_autograd_graph_module = ( nncf.experimental.torch.fx.compress_pt2e( builder_exported.pre_autograd_graph_module, From dc3b2191494eae590c4e4c1b9d6902d61ff7dc76 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 21 Jan 2026 23:50:16 +0400 Subject: [PATCH 08/17] clean unused imports --- backends/openvino/quantizer/llm_compression.py | 2 +- backends/openvino/tests/quantizer/test_llm_compression.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index bb6287721a3..d077e53fd96 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -6,7 +6,7 @@ # mypy: disable-error-code=import-not-found -from typing import Callable, List, Optional, Tuple, Union +from typing import Tuple import torch from executorch.extension.llm.export.builder import LLMEdgeManager diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index a1ce2c8d2b8..84088248562 100644 --- a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -117,7 +117,6 @@ class TestCalibrationDataGeneration(unittest.TestCase): and then compare it with a reference created manually""" def test_get_calibration_data_with_mock_module(self): - # Create mock tokenizer mock_tokenizer = Mock() mock_tokenizer.eos_id = 2 mock_tokenizer.encode = Mock(return_value=[1, 5, 6]) From 0d3d68160b1584fd7e12d6a88670120b2916a775 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Thu, 22 Jan 2026 00:06:01 +0400 Subject: [PATCH 09/17] lint --- .../tests/quantizer/synthetic_test_models.py | 17 ++-- .../tests/quantizer/test_llm_compression.py | 88 +++++++++++-------- 2 
files changed, 62 insertions(+), 43 deletions(-) diff --git a/backends/openvino/tests/quantizer/synthetic_test_models.py b/backends/openvino/tests/quantizer/synthetic_test_models.py index 6bdc3650a41..9e665fbdb3c 100644 --- a/backends/openvino/tests/quantizer/synthetic_test_models.py +++ b/backends/openvino/tests/quantizer/synthetic_test_models.py @@ -1,21 +1,22 @@ import torch -class SimpleTransformer(torch.nn.Module): + +class SimpleTransformer(torch.nn.Module): def __init__(self, vocab_size=100, hidden_size=64, num_layers=2): super().__init__() self.embed = torch.nn.Embedding(vocab_size, hidden_size) - self.layers = torch.nn.ModuleList([ - torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers) - ]) + self.layers = torch.nn.ModuleList( + [torch.nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)] + ) self.lm_head = torch.nn.Linear(hidden_size, vocab_size) self.vocab_size = vocab_size - + def forward(self, tokens, input_pos): x = self.embed(tokens) - + for layer in self.layers: x = torch.relu(layer(x)) - + logits = self.lm_head(x) - + return logits diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index 84088248562..6d4fbfb2492 100644 --- a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -1,17 +1,16 @@ import unittest -from unittest.mock import patch, Mock +from unittest.mock import Mock, patch + import torch -from executorch.extension.llm.export.builder import LLMEdgeManager +from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode from executorch.backends.openvino.quantizer.llm_compression import ( apply_nncf_data_aware_compression, get_calibration_data, ) -from executorch.backends.openvino.quantizer import ( - OpenVINOQuantizer, - QuantizationMode, -) -from synthetic_test_models import SimpleTransformer +from executorch.extension.llm.export.builder 
import LLMEdgeManager +from synthetic_test_models import SimpleTransformer # type: ignore[import-not-found] + class TestWeightsOnlyQuantization(unittest.TestCase): @classmethod @@ -22,8 +21,11 @@ def setUpClass(cls): cls.model.eval() cls.max_seq_len = 128 - cls.example_inputs = (torch.tensor([[1]], dtype=torch.long), {"input_pos": torch.tensor([0], dtype=torch.long)}) - + cls.example_inputs = ( + torch.tensor([[1]], dtype=torch.long), + {"input_pos": torch.tensor([0], dtype=torch.long)}, + ) + cls.compression_configs = [ { "name": "awq_only", @@ -46,7 +48,7 @@ def setUpClass(cls): "scale_estimation": False, }, ] - + cls.calibration_data = "The quick brown fox jumps over the lazy dog." def _create_builder(self, config_name, calibration_data=None): @@ -60,54 +62,73 @@ def _create_builder(self, config_name, calibration_data=None): } if calibration_data: - builder_kwargs.update({ - "calibration_seq_length": 32, - "calibration_data": calibration_data, - "tokenizer_path": "dummy_path", # Will be mocked - }) - - return LLMEdgeManager(**builder_kwargs) + builder_kwargs.update( + { + "calibration_seq_length": 32, + "calibration_data": calibration_data, + "tokenizer_path": "dummy_path", # Will be mocked + } + ) + return LLMEdgeManager(**builder_kwargs) - @patch('executorch.backends.openvino.quantizer.llm_compression.get_tokenizer') - @patch('executorch.backends.openvino.quantizer.llm_compression.get_calibration_data') + @patch("executorch.backends.openvino.quantizer.llm_compression.get_tokenizer") + @patch( + "executorch.backends.openvino.quantizer.llm_compression.get_calibration_data" + ) def test_compression_flow_with_mocked_calibration( self, mock_get_calibration_data, mock_get_tokenizer ): mock_calibration_data = [ - (0, 1), (1, 5), (2, 10), (3, 15), (4, 20), - (5, 25), (6, 30), (7, 35), (8, 40), (9, 45) + (0, 1), + (1, 5), + (2, 10), + (3, 15), + (4, 20), + (5, 25), + (6, 30), + (7, 35), + (8, 40), + (9, 45), ] mock_get_calibration_data.return_value = 
mock_calibration_data - + mock_tokenizer = Mock() mock_get_tokenizer.return_value = mock_tokenizer - + for config in self.compression_configs: with self.subTest(phase="compression_config", config=config["name"]): - calibration_data = self.calibration_data if config["awq"] or config["scale_estimation"] else None + calibration_data = ( + self.calibration_data + if config["awq"] or config["scale_estimation"] + else None + ) builder = self._create_builder( - config["name"], - calibration_data=calibration_data + config["name"], calibration_data=calibration_data ) builder.export() import copy + original_model = copy.deepcopy(builder.pre_autograd_graph_module) test_input = torch.tensor([[5]], dtype=torch.long) test_pos = torch.tensor([0], dtype=torch.long) reference_output = original_model(test_input, {"input_pos": test_pos}) - quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT4WO_SYM, group_size=-1) + quantizer = OpenVINOQuantizer( + mode=QuantizationMode.INT4WO_SYM, group_size=-1 + ) builder = apply_nncf_data_aware_compression( builder, quantizer=quantizer, awq=config["awq"], scale_estimation=config["scale_estimation"], ) - - compressed_output = builder.pre_autograd_graph_module(test_input, {"input_pos": test_pos}) + + compressed_output = builder.pre_autograd_graph_module( + test_input, {"input_pos": test_pos} + ) torch.allclose(compressed_output, reference_output) @@ -120,15 +141,12 @@ def test_get_calibration_data_with_mock_module(self): mock_tokenizer = Mock() mock_tokenizer.eos_id = 2 mock_tokenizer.encode = Mock(return_value=[1, 5, 6]) - + mock_module = Mock() mock_module.return_value = torch.tensor([[[0.1, 0.2, 0.9, 0.0]]]) - + result = get_calibration_data( - mock_module, - mock_tokenizer, - "test prompt", # Will be mocked - max_len=10 + mock_module, mock_tokenizer, "test prompt", max_len=10 # Will be mocked ) positions = [item[0] for item in result] From 12efc7000dee1844740ac3eccf3261b7452f8399 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Thu, 22 Jan 2026 
16:36:01 +0400 Subject: [PATCH 10/17] review changes --- .../openvino/quantizer/llm_compression.py | 24 ++++++++++++++----- backends/openvino/tests/test_runner.py | 3 --- examples/models/llama/export_llama_lib.py | 1 - 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index d077e53fd96..a7ca5ca8f09 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -19,7 +19,7 @@ raise ImportError("Please install nncf via backends/openvino/requirements.txt") -# This code is taken from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278 +# This code is adapted from https://github.com/pytorch/executorch/blob/0c54fd0483314da173f8e14d63d2ed9591c7133a/extension/llm/export/builder.py#L278 def get_calibration_data( module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int ): @@ -30,7 +30,7 @@ def get_calibration_data( Currently, this method is only tested with Llama models. """ # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int64) + pos = 0 token_list = tokenizer.encode(prompts, bos=True, eos=False) with torch.no_grad(): @@ -44,7 +44,7 @@ def get_calibration_data( token_list.append(torch.argmax(logits[:], dim=-1).item()) token_list = [ ( - pos, + torch.tensor(pos, dtype=torch.int64), token, ) for pos, token in enumerate(token_list) @@ -95,7 +95,7 @@ def apply_nncf_data_aware_compression( ): tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( - get_calibration_data( + get_calibration_data( # type: ignore[arg-type] builder_exported.pre_autograd_graph_module, tokenizer, builder_exported.calibration_data, @@ -106,8 +106,20 @@ def apply_nncf_data_aware_compression( # AWQ can work without a dataset as well. 
if scale_estimation and not nncf_calibration_data: - msg = "Scale Estimation is enabled but no calibration dataset is provided" - raise RuntimeError(msg) + missing_params = [] + if builder_exported.calibration_data is None: + missing_params.append("calibration_data") + if builder_exported.calibration_seq_length is None: + missing_params.append("calibration_seq_length") + if builder_exported.tokenizer_path is None: + missing_params.append("tokenizer_path") + msg = "Scale Estimation is enabled but no calibration dataset is provided." + if missing_params: + msg += ( + " Missing required calibration parameter(s): " + + ", ".join(missing_params) + + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path." + ) builder_exported.pre_autograd_graph_module = ( nncf.experimental.torch.fx.compress_pt2e( diff --git a/backends/openvino/tests/test_runner.py b/backends/openvino/tests/test_runner.py index bf744debd14..7d8c6b968c2 100644 --- a/backends/openvino/tests/test_runner.py +++ b/backends/openvino/tests/test_runner.py @@ -1,8 +1,6 @@ import argparse import unittest -import nncf.torch # type: ignore[import-untyped,import-not-found] - class OpenvinoTestSuite(unittest.TestSuite): @@ -68,7 +66,6 @@ def parse_arguments(): # Discover all existing op tests in "ops" folder suite = loader.discover(test_params["test_type"], pattern=test_params["pattern"]) # Start running tests - # with nncf.torch.disable_patching(): result = unittest.TextTestRunner().run(suite) if result.wasSuccessful(): print("OpenVINO backend tests completed successfully") diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 87c87972d9a..0b8440440a1 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -241,7 +241,6 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "--openvino_awq", - required=False, action="store_true", help="Whether to use AWQ from NNCF. 
Applicable only for the OpenVINO backend.",
     )

From 1236dfcd6b1cef88ebe4e5dc4a63aa2f8ea98bc6 Mon Sep 17 00:00:00 2001
From: anzr299
Date: Thu, 22 Jan 2026 16:36:27 +0400
Subject: [PATCH 11/17] compare reference scale values in tests

---
 .../tests/quantizer/synthetic_test_models.py  |   4 +-
 .../tests/quantizer/test_llm_compression.py   | 136 +++++++++++++-----
 2 files changed, 106 insertions(+), 34 deletions(-)

diff --git a/backends/openvino/tests/quantizer/synthetic_test_models.py b/backends/openvino/tests/quantizer/synthetic_test_models.py
index 9e665fbdb3c..6c7e91c5539 100644
--- a/backends/openvino/tests/quantizer/synthetic_test_models.py
+++ b/backends/openvino/tests/quantizer/synthetic_test_models.py
@@ -1,8 +1,8 @@
 import torch


-class SimpleTransformer(torch.nn.Module):
-    def __init__(self, vocab_size=100, hidden_size=64, num_layers=2):
+class ExportLlamaTestModel(torch.nn.Module):
+    def __init__(self, vocab_size=5, hidden_size=2, num_layers=1):
         super().__init__()
         self.embed = torch.nn.Embedding(vocab_size, hidden_size)
         self.layers = torch.nn.ModuleList(
diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py
index 6d4fbfb2492..252f9cd6875 100644
--- a/backends/openvino/tests/quantizer/test_llm_compression.py
+++ b/backends/openvino/tests/quantizer/test_llm_compression.py
@@ -7,17 +7,17 @@
 from executorch.backends.openvino.quantizer.llm_compression import (
     apply_nncf_data_aware_compression,
     get_calibration_data,
+    transform_fn,
 )
 from executorch.extension.llm.export.builder import LLMEdgeManager
-from synthetic_test_models import SimpleTransformer  # type: ignore[import-not-found]
+from synthetic_test_models import ExportLlamaTestModel  # type: ignore[import-not-found]


 class TestWeightsOnlyQuantization(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_name = "llama"
-        cls.model_class_name = "Llama2Model"
-        cls.model = SimpleTransformer()
+        torch.manual_seed(42)
+        cls.model = 
ExportLlamaTestModel(vocab_size=5, hidden_size=2, num_layers=1) cls.model.eval() cls.max_seq_len = 128 @@ -51,6 +51,61 @@ def setUpClass(cls): cls.calibration_data = "The quick brown fox jumps over the lazy dog." + cls.reference_scales = { + "awq_only": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + ), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.058624]], dtype=torch.float16 + ), + "relu/awq_mul._scale_value": torch.tensor([[[1.0, 1.0]]]), + "symmetric_weights_decompressor_lm_head_weight_updated_constant0._scale": torch.tensor( + [[0.053131], [0.087280], [-0.079834], [-0.068237], [-0.054626]], + dtype=torch.float16, + ), + }, + "scale_estimation_only": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + ), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.057709]], dtype=torch.float16 + ), + "symmetric_weights_decompressor_lm_head_weight._scale": torch.tensor( + [[0.0], [0.0], [-0.0], [-0.0], [-0.0]], dtype=torch.float16 + ), + }, + "awq_and_scale_estimation": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + ), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.057709]], dtype=torch.float16 + ), + "relu/awq_mul._scale_value": torch.tensor([[[1.0, 1.0]]]), + "symmetric_weights_decompressor_lm_head_weight_updated_constant0._scale": torch.tensor( + [[0.0], [0.0], [-0.0], [-0.0], [-0.0]], dtype=torch.float16 + ), + }, + "no_calibration": { + "symmetric_weights_decompressor_embed_weight._scale": torch.tensor( + [[-0.042084], [-0.029312], [0.140381], [-0.276123], [-0.057709]], + dtype=torch.float16, + 
), + "symmetric_weights_decompressor_layers_0_weight._scale": torch.tensor( + [[0.040710], [-0.058624]], dtype=torch.float16 + ), + "symmetric_weights_decompressor_lm_head_weight._scale": torch.tensor( + [[0.053131], [0.087280], [-0.079834], [-0.068237], [-0.054626]], + dtype=torch.float16, + ), + }, + } + def _create_builder(self, config_name, calibration_data=None): builder_kwargs = { "model": self.model, @@ -66,12 +121,29 @@ def _create_builder(self, config_name, calibration_data=None): { "calibration_seq_length": 32, "calibration_data": calibration_data, - "tokenizer_path": "dummy_path", # Will be mocked + "tokenizer_path": "dummy_path", } ) return LLMEdgeManager(**builder_kwargs) + def _extract_scales_from_model(self, model): + extracted_scales = {} + state_dict = dict(model.state_dict()) + for name, _ in state_dict.items(): + if "_scale" in name.lower(): + extracted_scales[name] = state_dict[name] + return extracted_scales + + def _compare_scales(self, extracted_scales, reference_scales): + for name, reference_value in reference_scales.items(): + self.assertIn(name, extracted_scales, f"Scale {name} not found in model") + extracted_value = extracted_scales[name] + self.assertTrue( + torch.allclose(extracted_value, reference_value), + f"Scale {name} mismatch {extracted_value}", + ) + @patch("executorch.backends.openvino.quantizer.llm_compression.get_tokenizer") @patch( "executorch.backends.openvino.quantizer.llm_compression.get_calibration_data" @@ -79,18 +151,7 @@ def _create_builder(self, config_name, calibration_data=None): def test_compression_flow_with_mocked_calibration( self, mock_get_calibration_data, mock_get_tokenizer ): - mock_calibration_data = [ - (0, 1), - (1, 5), - (2, 10), - (3, 15), - (4, 20), - (5, 25), - (6, 30), - (7, 35), - (8, 40), - (9, 45), - ] + mock_calibration_data = [(i, i) for i in range(5)] mock_get_calibration_data.return_value = mock_calibration_data mock_tokenizer = Mock() @@ -108,16 +169,13 @@ def 
test_compression_flow_with_mocked_calibration( config["name"], calibration_data=calibration_data ) builder.export() - import copy - original_model = copy.deepcopy(builder.pre_autograd_graph_module) - - test_input = torch.tensor([[5]], dtype=torch.long) + test_input = torch.tensor([[4]], dtype=torch.long) test_pos = torch.tensor([0], dtype=torch.long) - reference_output = original_model(test_input, {"input_pos": test_pos}) - + # Quantize weights for all layers(including embedding and lm_head which would by default be in INT8) + # to Per-Channel INT4 Symmetric quantizer = OpenVINOQuantizer( - mode=QuantizationMode.INT4WO_SYM, group_size=-1 + mode=QuantizationMode.INT4WO_SYM, group_size=-1, all_layers=True ) builder = apply_nncf_data_aware_compression( builder, @@ -125,17 +183,18 @@ def test_compression_flow_with_mocked_calibration( awq=config["awq"], scale_estimation=config["scale_estimation"], ) - - compressed_output = builder.pre_autograd_graph_module( - test_input, {"input_pos": test_pos} + # Run the model to check it is performant + builder.pre_autograd_graph_module(test_input, {"input_pos": test_pos}) + extracted_scales = self._extract_scales_from_model( + builder.pre_autograd_graph_module + ) + self._compare_scales( + extracted_scales, + self.reference_scales[config["name"]], ) - - torch.allclose(compressed_output, reference_output) class TestCalibrationDataGeneration(unittest.TestCase): - """Test the calibration data generation method. 
We first create a mock tokenizer - and then compare it with a reference created manually""" def test_get_calibration_data_with_mock_module(self): mock_tokenizer = Mock() @@ -146,8 +205,21 @@ def test_get_calibration_data_with_mock_module(self): mock_module.return_value = torch.tensor([[[0.1, 0.2, 0.9, 0.0]]]) result = get_calibration_data( - mock_module, mock_tokenizer, "test prompt", max_len=10 # Will be mocked + mock_module, mock_tokenizer, "test prompt", max_len=10 ) positions = [item[0] for item in result] self.assertEqual(positions, list(range(len(positions)))) + + def test_transform_fn(self): + token_pos_map = (5, 10) + result = transform_fn(token_pos_map) + + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 2) + + token, input_pos_dict = result + self.assertEqual(token.shape, torch.Size([1, 1])) + self.assertEqual(token, torch.tensor([[10]])) + self.assertIn("input_pos", input_pos_dict) + self.assertEqual(input_pos_dict["input_pos"], torch.tensor([5])) From 019b2cce0b9a8a5ce2ccf1e3e7954f0f6161ea22 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Thu, 22 Jan 2026 16:47:05 +0400 Subject: [PATCH 12/17] remove dead code --- backends/openvino/quantizer/llm_compression.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index a7ca5ca8f09..c4862161aa0 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -113,7 +113,6 @@ def apply_nncf_data_aware_compression( missing_params.append("calibration_seq_length") if builder_exported.tokenizer_path is None: missing_params.append("tokenizer_path") - msg = "Scale Estimation is enabled but no calibration dataset is provided." 
if missing_params: msg += ( " Missing required calibration parameter(s): " From 562261f0bab0ce86e6f527616564a1788f4f7944 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Feb 2026 14:35:12 +0400 Subject: [PATCH 13/17] lint fixes --- backends/openvino/quantizer/llm_compression.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index c4862161aa0..86b45d55ff2 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -95,8 +95,8 @@ def apply_nncf_data_aware_compression( ): tokenizer = get_tokenizer(builder_exported.tokenizer_path) nncf_calibration_data = nncf.Dataset( - get_calibration_data( # type: ignore[arg-type] - builder_exported.pre_autograd_graph_module, + get_calibration_data( + builder_exported.pre_autograd_graph_module, # type: ignore[arg-type] tokenizer, builder_exported.calibration_data, builder_exported.calibration_seq_length, @@ -114,11 +114,12 @@ def apply_nncf_data_aware_compression( if builder_exported.tokenizer_path is None: missing_params.append("tokenizer_path") if missing_params: - msg += ( + msg = ( " Missing required calibration parameter(s): " + ", ".join(missing_params) + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path." 
) + raise ValueError(msg) builder_exported.pre_autograd_graph_module = ( nncf.experimental.torch.fx.compress_pt2e( From ecd5b8a85d353658d867fa62e21a3a721c13e522 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Feb 2026 14:35:28 +0400 Subject: [PATCH 14/17] extend test for error --- .../tests/quantizer/test_llm_compression.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index 252f9cd6875..e06a750a0f9 100644 --- a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -193,6 +193,28 @@ def test_compression_flow_with_mocked_calibration( self.reference_scales[config["name"]], ) + def test_scale_estimation_requires_calibration_params(self): + builder = self._create_builder("missing_calibration_data", calibration_data=None) + builder.export() + + quantizer = OpenVINOQuantizer( + mode=QuantizationMode.INT4WO_SYM, group_size=-1, all_layers=True + ) + + with self.assertRaises(ValueError) as cm: + apply_nncf_data_aware_compression( + builder, + quantizer=quantizer, + awq=False, + scale_estimation=True, + ) + + err = str(cm.exception) + self.assertIn("Missing required calibration parameter(s)", err) + self.assertIn("calibration_data", err) + self.assertIn("calibration_seq_length", err) + self.assertIn("tokenizer_path", err) + class TestCalibrationDataGeneration(unittest.TestCase): From d72466d8c8e5d79729d26146696cb7cb20c3438f Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Feb 2026 14:49:40 +0400 Subject: [PATCH 15/17] lint --- backends/openvino/tests/quantizer/test_llm_compression.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/openvino/tests/quantizer/test_llm_compression.py b/backends/openvino/tests/quantizer/test_llm_compression.py index e06a750a0f9..6dfef1fb600 100644 --- 
a/backends/openvino/tests/quantizer/test_llm_compression.py +++ b/backends/openvino/tests/quantizer/test_llm_compression.py @@ -194,7 +194,9 @@ def test_compression_flow_with_mocked_calibration( ) def test_scale_estimation_requires_calibration_params(self): - builder = self._create_builder("missing_calibration_data", calibration_data=None) + builder = self._create_builder( + "missing_calibration_data", calibration_data=None + ) builder.export() quantizer = OpenVINOQuantizer( From 83f0fb8910e757746dd250128edd47dd4a1883d5 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 4 Feb 2026 11:36:53 +0400 Subject: [PATCH 16/17] remove leading space in error message --- backends/openvino/quantizer/llm_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/quantizer/llm_compression.py b/backends/openvino/quantizer/llm_compression.py index 86b45d55ff2..1737f638bf9 100644 --- a/backends/openvino/quantizer/llm_compression.py +++ b/backends/openvino/quantizer/llm_compression.py @@ -115,7 +115,7 @@ def apply_nncf_data_aware_compression( missing_params.append("tokenizer_path") if missing_params: msg = ( - " Missing required calibration parameter(s): " + "Missing required calibration parameter(s): " + ", ".join(missing_params) + ". Please provide calibration_data, calibration_seq_length, and tokenizer_path." ) From 0093592df4a3f7cedf803207c656e74b6e8ee7d6 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Tue, 24 Feb 2026 18:22:56 +0400 Subject: [PATCH 17/17] update nncf version to 3.0.0 --- backends/openvino/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 814b0d68fff..ba338416583 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1 +1 @@ -git+https://github.com/openvinotoolkit/nncf@88445b32d6a622e177945c331beef95e222dbe00 +nncf==3.0.0