Skip to content
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ save*
.log
*.pid
*.ipynb*
.venv/
*.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
base:
    seed: &seed 42
model:
    type: model_type
    path: model path
    torch_dtype: auto
calib:
    name: pileval
    download: False
    path: calib data path
    n_samples: 128
    bs: 1
    seq_len: 2048
    preproc: txt_general_preproc
    seed: *seed
eval:
    # long_ppl eval does not support the pretrain eval_pos
    eval_pos: [transformed, fake_quant, fake_quant_wo_kv]
    name: wikitext2
    type: decode_ppl
    download: False
    path: eval_data_path
    bs: 1
    inference_per_block: False
    num_samples: 10
    # num_eval_tokens: 3
quant:
    method: RTN
    weight:
        bit: 8
        symmetric: True
        granularity: per_channel
        group_size: -1
    act:
        bit: 8
        symmetric: True
        granularity: per_tensor
        static: True
    kvcache:
        method: Naive
        bit: 8
        symmetric: True
        granularity: per_head
        head_num: kv head num
save:
    # Key names must match what llmc/__main__.py reads:
    # save_lightllm_kv_cache_calib and lightllm_kv_cache_calib_name.
    save_lightllm_kv_cache_calib: True
    lightllm_kv_cache_calib_name: kv_cache_calib.json
    save_fake: False
    save_path: /path/to/save/
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,7 @@ quant:
symmetric: True
granularity: per_tensor
save:
save_lightllm_kv_cache_calib: True
lightllm_kv_cache_calib_name: kv_cache_calib.json
save_fake: False
save_path: /path/to/save/
save_path: /path/to/save/
24 changes: 24 additions & 0 deletions llmc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,21 @@ def main(config):

eval_model(model, blockwise_opts, eval_list, eval_pos='transformed')
if int(os.environ['RANK']) == 0:
if 'save' in config and config.save.get('save_lightllm_kv_cache_calib', False):
calib_json_list = [
blockwise_opt.collect_calib_json()
for blockwise_opt in blockwise_opts
if hasattr(blockwise_opt, 'collect_calib_json')
]
calib_json_payload = (
calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list
)
with open(save_lightllm_kv_cache_calib_path, 'w') as file:
json.dump(calib_json_payload, file, ensure_ascii=False, indent=4)
logger.info(
f'save lightllm kv cache calib done -- {save_lightllm_kv_cache_calib_path}'
)

if 'save' in config and config.save.get('save_trans', False):
blockwise_opt.save_model(save_trans_path)

Expand Down Expand Up @@ -209,6 +224,14 @@ def main(config):
# Ensure only the main process creates directories
if int(os.environ['RANK']) == 0:
if 'save' in config:
if config.save.get('save_lightllm_kv_cache_calib', False):
mkdirs(config.save.save_path)
save_lightllm_kv_cache_calib_path = os.path.join(
config.save.save_path,
config.save.get(
'lightllm_kv_cache_calib_name', 'kv_cache_calib.json'
),
)
if config.save.get('save_trans', False):
save_trans_path = os.path.join(
config.save.save_path, 'transformed_model'
Expand Down Expand Up @@ -266,3 +289,4 @@ def main(config):
llmc_duration_time = llmc_end_time - llmc_start_time
logger.info(f'llmc_duration_time: {llmc_duration_time} s')
logger.info('--- llmc finished ---')

55 changes: 51 additions & 4 deletions llmc/compression/quantization/base_blockwise_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import torch.nn as nn
from loguru import logger

from llmc.utils.export_calib import collect_lightllm_kv_calib_json
from llmc.utils.registry_factory import KV_REGISTRY, TOKEN_REDUCTION_REGISTRY

from ..blockwise_optimization import BlockwiseOpt
Expand Down Expand Up @@ -175,13 +176,18 @@ def set_quant_config(self):
self.act_quant_module = IntegerQuantizer
elif quant_type == 'float-quant':
self.act_quant_module = FloatQuantizer
self.quant_config['act']['tp'] = self.tp
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
self.act_static = self.quant_config['act'].get('static', False)
if self.act_static:
assert (
self.quant_config['act']['granularity'] == 'per_tensor'
), 'Only support per_tensor static quant'
# Static activation quantization uses the batched calibration
# path, so normalize the default minmax setting to
# static_minmax to match the downstream calibration logic.
if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax':
self.quant_config['act']['calib_algo'] = 'static_minmax'
self.quant_config['act']['tp'] = self.tp
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
self.quant_attn = self.quant_config['act'].get('quant_attn', False)
if self.quant_attn:
assert self.config['model']['type'] in ['Vit', 'DeepseekV2']
Expand All @@ -203,8 +209,10 @@ def set_quant_config(self):
kv_special_cfg = self.quant_config['kvcache'].get('special', {})
act_static_cfg = {}
if self.act_static:
act_static_cfg.update(self.config.calib.n_sample)
act_static_cfg.update(self.config.calib.bs)
# The KV cache constructor expects num_samples / bsz, so map
# the calibration config fields to the parameter names it uses.
act_static_cfg['num_samples'] = self.config.calib.n_samples
act_static_cfg['bsz'] = self.config.calib.bs
kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant')
self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']](
kv_quant_type, self.quant_config['kvcache'],
Expand Down Expand Up @@ -1003,6 +1011,45 @@ def contiguous_params(self):
if not param.is_contiguous():
param.data = param.data.contiguous()

# Convert tensors and similar objects into Python values that can be
# directly serialized into JSON.
def _to_jsonable(self, value):
if isinstance(value, torch.Tensor):
return value.detach().cpu().tolist()
return value

# Normalize inputs into CPU tensors so the following range computation
# and serialization logic can handle them consistently.
def _to_tensor(self, value, dtype=torch.float32):
if isinstance(value, torch.Tensor):
return value.detach().cpu().to(dtype)
return torch.as_tensor(value, dtype=dtype)

# LightLLM expects offline FP8 KV descales. Recover the real-value range
# from the qparams first, then convert it into per-layer K/V scales that
# align with torch.float8_e4m3fn.
def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax):
if isinstance(scales, torch.Tensor) and scales.numel() == 0:
return None

scales_tensor = self._to_tensor(scales)
zeros_tensor = self._to_tensor(zeros, dtype=scales_tensor.dtype)
qmin_tensor = self._to_tensor(qmin, dtype=scales_tensor.dtype)
qmax_tensor = self._to_tensor(qmax, dtype=scales_tensor.dtype)
min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor
max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor
absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs())
fp8_qmax = torch.tensor(
torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype
)
return absmax_tensor / fp8_qmax

# Export calibration results in the LightLLM kv_cache_calib.json format.
# At the moment, only the per_tensor and per_head KV formats supported by
# LightLLM are handled here.
def collect_calib_json(self):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The process of exporting calibration json file strictly relates with LightLLM. It's better to move this part to the utils folder.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I will make it more moduled

return collect_lightllm_kv_calib_json(self)

@torch.no_grad()
def save_model(self, path):
if int(os.environ['RANK']) != 0:
Expand Down
11 changes: 10 additions & 1 deletion llmc/compression/quantization/kvquant.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import torch
from loguru import logger
from transformers import DynamicCache
Expand All @@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache):
def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1):
super().__init__()

assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group']
# Copy the config to avoid mutating the original quantization config in static KV calibration.
kvquant_cfg = copy.deepcopy(kvquant_cfg)
assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group', 'per_head']
self.num_hidden_layers, self.num_samples, self.bsz = (
num_hidden_layers,
num_samples,
bsz,
)
if kvquant_cfg.get('static', False) and kvquant_cfg.get(
'calib_algo', 'minmax'
) == 'minmax':
# Static KV calibration uses the batched tensor statistics path, so convert the default
# minmax setting to static_minmax here to avoid a later calibration algo name mismatch.
kvquant_cfg['calib_algo'] = 'static_minmax'
if quant_type == 'int-quant':
self.kvquantizer = IntegerQuantizer(**kvquant_cfg)
elif quant_type == 'float-quant':
Expand Down
1 change: 1 addition & 0 deletions llmc/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .export_autoawq import update_autoawq_quant_config
from .export_calib import collect_lightllm_kv_calib_json
from .export_lightx2v import update_lightx2v_quant_config
from .export_vllm import update_vllm_quant_config
from .utils import (check_config, copy_files, deploy_all_modality,
Expand Down
69 changes: 69 additions & 0 deletions llmc/utils/export_calib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import torch


def collect_lightllm_kv_calib_json(blockwise_opt):
if not getattr(blockwise_opt, 'quant_kvcache', False):
raise ValueError(
'save_lightllm_kv_cache_calib requires kvcache quantization.'
)

kv_cfg = blockwise_opt.quant_config['kvcache']
granularity = kv_cfg.get('granularity')
if granularity not in ['per_tensor', 'per_head']:
raise ValueError(
f'LightLLM calib export only supports per_tensor/per_head, got {granularity}'
)

num_layers = blockwise_opt.model.model_config.num_hidden_layers
num_head = int(
getattr(
blockwise_opt.model.model_config,
'num_key_value_heads',
blockwise_opt.model.get_num_attention_heads(),
)
)
scales = []
for layer_idx in range(num_layers):
key_scale = blockwise_opt._collect_lightllm_kv_scale(
blockwise_opt.kv_module.k_scales_buffer[layer_idx],
blockwise_opt.kv_module.k_zeros_buffer[layer_idx],
blockwise_opt.kv_module.k_qmin_buffer[layer_idx],
blockwise_opt.kv_module.k_qmax_buffer[layer_idx],
)
value_scale = blockwise_opt._collect_lightllm_kv_scale(
blockwise_opt.kv_module.v_scales_buffer[layer_idx],
blockwise_opt.kv_module.v_zeros_buffer[layer_idx],
blockwise_opt.kv_module.v_qmin_buffer[layer_idx],
blockwise_opt.kv_module.v_qmax_buffer[layer_idx],
)
if key_scale is None or value_scale is None:
raise ValueError(f'Calibration scale for layer {layer_idx} is empty.')

scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist()
scales.append(scale_row)

scale_width = len(scales[0]) if scales else 0
if granularity == 'per_tensor' and scale_width != 2:
raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}')
if granularity == 'per_head' and scale_width != num_head * 2:
raise ValueError(
f'per_head export expects {num_head * 2} scales per layer, got {scale_width}'
)

architectures = getattr(blockwise_opt.model.model_config, 'architectures', None)
if isinstance(architectures, list) and len(architectures) > 0:
architectures = architectures[0]
elif architectures is None:
architectures = blockwise_opt.config.model.type

return {
'version': '1.0',
'architectures': architectures,
'quant_type': granularity,
'qmin': float(torch.finfo(torch.float8_e4m3fn).min),
'qmax': float(torch.finfo(torch.float8_e4m3fn).max),
'num_layers': num_layers,
'num_head': num_head,
'scales_shape': [num_layers, scale_width],
'scales': scales,
}
Loading