Changes from 1 commit
add awq and scale estimation
anzr299 committed Oct 20, 2025
commit e0010bb31a5dee0f7208c1c29e30b101a2b0c343
2 changes: 2 additions & 0 deletions backends/openvino/quantizer/quantizer.py
@@ -193,6 +193,8 @@ def _annotate_weight_compression(
)

for wc_param in all_wc_params:
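    # Skip weight-compression parameters that were assigned no compression
    # config (i.e. weights NNCF decided not to compress).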
if not wc_param.compression_config:
continue
node_with_weight = wc_param.node_with_weight
target_node = nncf_fx.node_utils.get_graph_node_by_name(
graph, node_with_weight.node_name
77 changes: 74 additions & 3 deletions examples/models/llama/export_llama_lib.py
@@ -253,6 +253,19 @@ def build_args_parser() -> argparse.ArgumentParser:
help="Path to the adapter_config.json file. Used if the model has trained LoRA adapters. Must provide adapter_checkpoint.",
)

    parser.add_argument(
        "--nncf_awq",
        required=False,
        action="store_true",
        help="Whether to use AWQ from NNCF. Applicable only for the OpenVINO backend.",
    )

    parser.add_argument(
        "--nncf_scale_estimation",
        action="store_true",
        help="Whether to use the Scale Estimation algorithm from NNCF. Applicable only for the OpenVINO backend.",
    )

parser.add_argument(
"--use_qnn_sha",
action="store_true",
@@ -912,6 +925,8 @@ def _to_edge_and_lower_llama_openvino(
    modelname,
    quantizers,
    additional_passes,
    awq: bool,
    scale_estimation: bool,
    openvino_device: str = "CPU",
    verbose: bool = False,
) -> LLMEdgeManager:  # noqa: C901
@@ -925,9 +940,63 @@
    for partitioner in partitioners:
        logging.info(f"--> {partitioner.__class__.__name__}")

    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
        partitioners
    )
    if awq or scale_estimation:
        try:
            import nncf
            from pytorch_tokenizers import get_tokenizer
        except ImportError:
            raise ImportError(
                "Please install nncf via backends/openvino/requirements.txt"
            )
        tokenizer = get_tokenizer(builder_exported.tokenizer_path)
Owner:
I wonder if we can use helper functions to reduce this part of the code inside export_llama_lib.py maybe?

Author:
Done.
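
For readers of this thread: a minimal sketch of the kind of extraction being suggested, reusing the calibration helpers (get_calibration_data, transform_fn) defined just below in this diff. The helper name and signature here are hypothetical, not what the commit actually landed:

def _nncf_compress_weights(builder_exported, awq: bool, scale_estimation: bool):
    # Hypothetical helper: bundles the NNCF calibration + INT4 compression
    # steps so _to_edge_and_lower_llama_openvino stays short.
    import nncf
    from pytorch_tokenizers import get_tokenizer

    tokenizer = get_tokenizer(builder_exported.tokenizer_path)
    calibration_data = get_calibration_data(
        builder_exported.pre_autograd_graph_module,
        tokenizer,
        builder_exported.calibration_data,
        builder_exported.max_seq_len,
    )
    builder_exported.pre_autograd_graph_module = nncf.compress_pt2e(
        builder_exported.pre_autograd_graph_module,
        dataset=nncf.Dataset(calibration_data, transform_func=transform_fn),
        mode=nncf.CompressWeightsMode.INT4_SYM,
        awq=awq,
        scale_estimation=scale_estimation,
    )
    return builder_exported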

        def get_calibration_data(
            module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int
        ):
            # TODO: change criteria & support batch inputs if necessary
            pos = torch.tensor(0, dtype=torch.int64)
            token_list = tokenizer.encode(prompts, bos=True, eos=False)

            # Greedily decode one token at a time so calibration covers
            # generated positions as well as the prompt.
            with torch.no_grad():
                while token_list[-1] != tokenizer.eos_id and pos < max_len:
                    logits = module(
                        torch.full((1, 1), token_list[pos]),
                        {"input_pos": torch.tensor((pos,))},
                    )
                    pos += 1
                    if pos >= len(token_list):
                        token_list.append(torch.argmax(logits, dim=-1).item())
            # Each calibration sample is a (position, token) pair.
            return [(pos, token) for pos, token in enumerate(token_list)]

        def transform_fn(data_item):
            # NNCF's Dataset passes one calibration item at a time; each item
            # is a (position, token) pair from get_calibration_data above.
            pos, curr_token = data_item
            return (
                torch.tensor(curr_token).unsqueeze(0),
                {"input_pos": torch.tensor([pos])},
            )

        builder_exported.calibration_data = get_calibration_data(
            builder_exported.pre_autograd_graph_module,
            tokenizer,
            builder_exported.calibration_data,
            builder_exported.max_seq_len,
        )

        builder_exported.pre_autograd_graph_module = nncf.compress_pt2e(
            builder_exported.pre_autograd_graph_module,
            dataset=nncf.Dataset(
                builder_exported.calibration_data,
                transform_func=transform_fn,
            ),
            mode=nncf.CompressWeightsMode.INT4_SYM,
            awq=awq,
            scale_estimation=scale_estimation,
        )
        builder = builder_exported.to_edge_transform_and_lower(partitioners)
    else:
        builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
            partitioners
        )

if verbose:
print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1185,6 +1254,8 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
modelname,
quantizers,
additional_passes,
awq=llm_config.backend.openvino.awq,
scale_estimation=llm_config.backend.openvino.scale_estimation,
openvino_device=llm_config.backend.openvino.device,
verbose=llm_config.debug.verbose,
)
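To make the data flow above concrete: NNCF pulls one item at a time from the Dataset and runs it through transform_fn, so each (position, token) pair becomes a (token_tensor, {"input_pos": ...}) pair matching the exported module's forward signature. A standalone sketch, assuming nncf and torch are installed (the token ids below are made up):

import nncf
import torch

def transform_fn(data_item):
    # One calibration sample: a (position, token) pair.
    pos, token = data_item
    return (torch.tensor(token).unsqueeze(0), {"input_pos": torch.tensor([pos])})

# Hypothetical (position, token_id) pairs, as get_calibration_data would emit.
pairs = [(0, 1), (1, 3924), (2, 447)]
dataset = nncf.Dataset(pairs, transform_func=transform_fn)

for sample in dataset.get_inference_data():
    print(sample)  # e.g. (tensor([1]), {'input_pos': tensor([0])})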
8 changes: 6 additions & 2 deletions extension/llm/export/config/llm_config.py
@@ -466,6 +466,8 @@ class OpenvinoConfig:
    device: str = "CPU"
    nncf_compression: bool = False
    nncf_compression_group_size: int = 32
    awq: bool = False
    scale_estimation: bool = False


@dataclass
@@ -667,8 +669,10 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
llm_config.backend.openvino.enabled = args.openvino
if hasattr(args, "openvino_device"):
llm_config.backend.openvino.device = args.openvino_device
if hasattr(args, "nncf_compression"):
llm_config.backend.openvino.nncf_compression = args.nncf_compression
if hasattr(args, "nncf_awq"):
llm_config.backend.openvino.nncf_awq = args.nncf_awq
if hasattr(args, "nncf_scale_estimation"):
llm_config.backend.openvino.nncf_scale_estimation = args.nncf_scale_estimation
if hasattr(args, "group_size") and args.group_size:
llm_config.backend.openvino.nncf_compression_group_size = args.group_size
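
End to end, the new options travel CLI flag → argparse → LlmConfig. A sketch of setting the equivalent fields directly, assuming LlmConfig's nested backend configs default-construct (the import path mirrors this repo's layout; adjust to the installed package name):

# Assumption: LlmConfig and its nested configs can be default-constructed.
from extension.llm.export.config.llm_config import LlmConfig

llm_config = LlmConfig()
llm_config.backend.openvino.enabled = True
llm_config.backend.openvino.device = "CPU"
llm_config.backend.openvino.awq = True               # what --nncf_awq sets
llm_config.backend.openvino.scale_estimation = True  # what --nncf_scale_estimation sets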
