Commit ba46b21

add quant_nontext_module

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
1 parent: 56e2caf

5 files changed: +58 -11 lines
neural_compressor/torch/algorithms/weight_only/autoround.py (+4 -4)

@@ -84,7 +84,7 @@ def __init__(
         enable_torch_compile: bool = None,
         # mllm
         is_mllm: bool = False,
-        quant_nontext_module: Union[str, list] = None,
+        quant_nontext_module: bool = False,
         extra_data_dir: str = None,
         image_processor=None,
         processor=None,
@@ -150,7 +150,7 @@ def __init__(
             act_dynamic (bool): Whether to use dynamic activation quantization. Default is True.
             enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
             enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer, torch>=2.6 True.
-            quant_nontext_module (Union[str, list]): Whether to quantize nontext module.
+            quant_nontext_module (bool): Whether to quantize nontext module.
             is_mllm (bool): Indicates whether the model to be quantized is a multi-modal model (MLLM).
             extra_data_dir (str): The path for extra data such as images, audio or videos.
             processor (transformers.AutoProcessor): Any multi-modal model will require an object to encode or
@@ -383,7 +383,7 @@ def get_mllm_dataloader(
         template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor
     )
     dataset = template.default_dataset if dataset is None else dataset
-    if quant_nontext_module or (dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer)):
+    if quant_nontext_module or (dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer, "cpu", template.model_type)):
         if quant_nontext_module:
             logger.warning(
                 "Quantitative nontext module is not supported for plain text datasets,"
@@ -399,7 +399,7 @@ def get_mllm_dataloader(
         truncation = False
         gradient_accumulate_steps = batch_size * gradient_accumulate_steps
         batch_size = 1
-
+    seed = 42  # The seed is fixed to 42 in transformers
     seqlen = 2048 if seqlen is None else seqlen  # set text only calibration default args
     truncation = True if truncation is None else truncation
     dataset = dataset.replace(" ", "")
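
With quant_nontext_module now a plain bool and _only_text_test taking the device and model type explicitly, the dataloader decision in get_mllm_dataloader reduces to a simple predicate. Below is a self-contained sketch of that logic, using hypothetical stand-ins for the internal CALIB_DATASETS registry and _only_text_test helper (neither is defined in this diff):

# Illustrative sketch only; CALIB_DATASETS and _only_text_test are stand-ins
# for auto_round internals, not the real implementations.
CALIB_DATASETS = {"NeelNanda/pile-10k": None}  # hypothetical plain-text calibration registry


def _only_text_test(model, tokenizer, device, model_type):
    # Assumed to probe whether a text-only batch runs through the multimodal model.
    return False


def needs_multimodal_calibration(model, tokenizer, dataset, quant_nontext_module, model_type):
    # quant_nontext_module is a bool flag now; the device ("cpu" here) and the
    # template's model type are passed straight through to _only_text_test.
    return quant_nontext_module or (
        dataset in CALIB_DATASETS and not _only_text_test(model, tokenizer, "cpu", model_type)
    )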

neural_compressor/torch/quantization/config.py (+2 -2)

@@ -950,7 +950,7 @@ def __init__(
         enable_torch_compile: bool = None,
         # mllm
         is_mllm: bool = False,
-        quant_nontext_module: Union[str, list] = None,
+        quant_nontext_module: bool = False,
         extra_data_dir: str = None,
         processor=None,
         image_processor=None,
@@ -994,7 +994,7 @@ def __init__(
             export_format (str, optional): The format used for exporting the quantized model. Defaults to "itrex".
             enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
             enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer, torch>=2.6 True.
-            quant_nontext_module (Union[str, list]): Whether to quantize nontext module.
+            quant_nontext_module (bool): Whether to quantize nontext module.
             extra_data_dir (str): The path for extra data such as images, audio or videos.
             is_mllm (bool): Indicates whether the model to be quantized is a multi-modal model (MLLM).
             processor (transformers.AutoProcessor): Any multi-modal model will require an object to encode or
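
On the user-facing side, the same flag is exposed through AutoRoundConfig. A minimal usage sketch, assuming the neural_compressor.torch.quantization import path of the 3.x API; only is_mllm and quant_nontext_module come from this diff, everything else is left at its defaults:

from neural_compressor.torch.quantization import AutoRoundConfig

# Also quantize the non-text (e.g. vision) blocks of a multi-modal model.
config = AutoRoundConfig(
    is_mllm=True,
    quant_nontext_module=True,  # bool flag, previously Union[str, list]
)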

neural_compressor/transformers/quantization/utils.py (+40 -4)

@@ -18,6 +18,7 @@
 import math
 import os
 import types
+import re

 from datasets import load_dataset

@@ -40,6 +41,7 @@

 if is_package_available("auto_round"):
     import auto_round
+    import transformers
     from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woq_linear


@@ -132,18 +134,18 @@ def _replace_linear(
            isinstance(module, torch.nn.Linear)
            or isinstance(module, INCWeightOnlyLinear)
            or (is_package_available("auto_round") and isinstance(module, auto_round_woq_linear))
-            or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear))
        ) and (name not in modules_to_not_convert):
            # Check if the current key is not in the `modules_to_not_convert`
-            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
+            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert) and \
+                not any(re.match(pattern, ".".join(current_key_name)) for pattern in modules_to_not_convert):
                in_features = module.in_features
                out_features = module.out_features
                if device == "cpu" or device == torch.device("cpu") or device == "auto":
                    from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear
                    from intel_extension_for_pytorch.utils.weight_only_quantization import (
                        _convert_optimum_format_to_desired,
                    )
-
+
                    qweight = module.qweight
                    scales = module.scales
                    qzeros = module.qzeros
@@ -550,7 +552,41 @@ def convert_to_quantized_model(model, config, device="cpu"):
            gradient_accumulate_steps=config.gradient_accumulate_steps,
            export_format=config.export_format,
        )
-
+
+        # vlm set non-text module config
+        if config.is_vlm is True:
+            from neural_compressor.torch.utils.utility import (
+                get_multimodal_block_names,
+                find_matching_blocks,
+                get_layer_names_in_block,
+            )
+            def set_nontext_module_config(model, to_quant_block_names, config):
+                all_block_list = get_multimodal_block_names(model, quant_vision=True)
+                all_block_set = set(tuple(block) for block in all_block_list)
+                quant_block_set = set(tuple(block) for block in to_quant_block_names)
+                set_to_full_prec = list(all_block_set - quant_block_set)
+                set_to_full_prec = get_layer_names_in_block(model, to_quant_block_names=set_to_full_prec)
+                for name in set_to_full_prec:
+                    config.modules_to_not_convert.append(name)
+
+                # skip layers not in blocks
+                config.modules_to_not_convert.append("model.vision_embed_tokens.img_projection*")
+                config.modules_to_not_convert.append("transformer.visual.attn_pool.*_proj")
+                config.modules_to_not_convert.append("model.mm_projector*")
+                config.modules_to_not_convert.append("multi_modal_projector")
+                config.modules_to_not_convert.append("visual.merger")
+
+            all_blocks = get_multimodal_block_names(model, quant_config.quant_nontext_module)
+            to_quant_block_names = find_matching_blocks(model, all_blocks, quant_config.to_quant_block_names)
+            set_nontext_module_config(model, to_quant_block_names, config)
+
+            for n, m in model.named_modules():
+                if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+                    if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
+                        config.modules_to_not_convert.append(n)
+                        print(
+                            f"{n} will not be quantized due to its shape not being divisible by 32,"
+                            " resulting in an exporting issue to autogptq")
    if config.modules_to_not_convert != []:
        for module in config.modules_to_not_convert:
            module_name = ".*" + module
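
Two new exclusion mechanisms appear above: entries of modules_to_not_convert can now act as regex patterns (checked with re.match against the dotted module path) in addition to plain substrings, and any Linear/Conv1D whose weight shape is not divisible by 32 is skipped to avoid the autogptq export issue. Below is a toy, self-contained sketch of both filters; the model and pattern list are illustrative and not taken from the repository:

import re

from torch import nn


class ToyVLM(nn.Module):
    # Module names are illustrative only.
    def __init__(self):
        super().__init__()
        self.language_proj = nn.Linear(64, 64)          # shapes divisible by 32 -> convertible
        self.multi_modal_projector = nn.Linear(64, 64)  # excluded by name below
        self.odd_head = nn.Linear(64, 10)               # 10 % 32 != 0 -> excluded by shape


model = ToyVLM()
modules_to_not_convert = ["multi_modal_projector", "model.mm_projector*"]

# Shape filter, mirroring the divisible-by-32 exclusion added to convert_to_quantized_model.
for name, mod in model.named_modules():
    if isinstance(mod, nn.Linear) and (mod.weight.shape[0] % 32 or mod.weight.shape[1] % 32):
        modules_to_not_convert.append(name)


# Name filter: the original substring check plus the new re.match pattern check.
def is_excluded(dotted_name):
    return any(key in dotted_name for key in modules_to_not_convert) or any(
        re.match(pattern, dotted_name) for pattern in modules_to_not_convert
    )


for name, mod in model.named_modules():
    if isinstance(mod, nn.Linear):
        print(name, "-> kept in full precision" if is_excluded(name) else "-> quantized")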

neural_compressor/transformers/utils/quantization_config.py (+1 -1)

@@ -545,7 +545,7 @@ def __init__(
         quant_lm_head: bool = False,
         # vlm arguments
         is_vlm: bool = False,
-        quant_nontext_module: Union[str, list] = None,
+        quant_nontext_module: bool = False,
         truncation: bool = False,
         gradient_accumulate_steps: int = 1,
         export_format="itrex",

test/3x/torch/quantization/weight_only/test_transformers.py (+11)

@@ -249,6 +249,17 @@ def test_vlm(self):
         assert isinstance(loaded_model.model.layers[0].self_attn.k_proj, WeightOnlyQuantizedLinear), "loaing model failed."

         # phi-3-vision-128k-instruct
+        woq_config = AutoRoundConfig(
+            bits=4,
+            group_size=128,
+            is_vlm=True,
+            dataset="NeelNanda/pile-10k",
+            iters=2,
+            n_samples=5,
+            seq_len=64,
+            batch_size=1,
+        )
         model_name = "microsoft/Phi-3-vision-128k-instruct"
         woq_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager')
         assert isinstance(woq_model.model.layers[0].self_attn.o_proj, WeightOnlyQuantizedLinear), "quantizaion failed."
+
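
For a broader sanity check than the single-layer asserts in test_vlm, one can tally which linear modules actually got converted after a run like the Phi-3-vision case above. The WeightOnlyQuantizedLinear import path below is the one already used in utils.py; the helper itself is only a sketch and is not part of the test suite:

from collections import Counter

import torch
from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear


def summarize_conversion(model):
    # Count quantized vs. skipped linear layers and record the skipped names.
    counts, skipped = Counter(), []
    for name, mod in model.named_modules():
        if isinstance(mod, WeightOnlyQuantizedLinear):
            counts["quantized"] += 1
        elif isinstance(mod, torch.nn.Linear):
            counts["kept_fp"] += 1
            skipped.append(name)
    return counts, skipped


# e.g. after quantizing the VLM in test_vlm:
# counts, skipped = summarize_conversion(woq_model)
# print(counts, skipped[:10])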
