From a842ded98f4dfbda733b15b2866221cd15742b9c Mon Sep 17 00:00:00 2001
From: changwangss
Date: Fri, 22 Nov 2024 07:05:01 +0000
Subject: [PATCH 01/13] support layerwise quantization

Signed-off-by: changwangss
---
 examples/neural_compressor/language-modeling/run_clm.py | 7 +++++++
 tests/neural_compressor/test_optimization.py            | 1 +
 2 files changed, 8 insertions(+)

diff --git a/examples/neural_compressor/language-modeling/run_clm.py b/examples/neural_compressor/language-modeling/run_clm.py
index 7e81072194..6a46585295 100644
--- a/examples/neural_compressor/language-modeling/run_clm.py
+++ b/examples/neural_compressor/language-modeling/run_clm.py
@@ -215,6 +215,12 @@ class OptimizationArguments:
         default="sym",
         metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
     )
+    use_layer_wise: bool = field(
+        default=False,
+        metadata={
+            "help": "Use layer wise to do quantization to save memory."
+        },
+    )
     quantization_methodology: str = field(
         default="rtn",
         metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
@@ -659,6 +665,7 @@ def compute_metrics(eval_preds):
             "bits": optim_args.bits,
             "sym": optim_args.weight_only_scheme == "sym",
             "group_size": optim_args.group_size,
+            "use_layer_wise": optim_args.use_layer_wise,
         }

         if optim_args.quantization_methodology == "gptq":
diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 75f2845c78..72f19fdb20 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -489,6 +489,7 @@ def test_weight_only_quantization(self, methodology, bits):
                 batch_size=5,
                 seq_len=32,
                 block_size=16,
+                user_layer_wise=True,
             )
         else:
             quantization_config = RtnConfig(bits=bits, group_size=8)

From 5f1465878c09d3c45e5c142168d27a2b06eb24e2 Mon Sep 17 00:00:00 2001
From: changwa1
Date: Fri, 22 Nov 2024 17:26:26 +0800
Subject: [PATCH 02/13] fix ut and example

Signed-off-by: changwa1
---
 examples/neural_compressor/language-modeling/run_clm.py | 5 ++---
 optimum/intel/neural_compressor/quantization.py         | 6 ++++++
 tests/neural_compressor/test_optimization.py            | 5 +++--
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/examples/neural_compressor/language-modeling/run_clm.py b/examples/neural_compressor/language-modeling/run_clm.py
index 6a46585295..55f79b2185 100644
--- a/examples/neural_compressor/language-modeling/run_clm.py
+++ b/examples/neural_compressor/language-modeling/run_clm.py
@@ -217,9 +217,7 @@ class OptimizationArguments:
     )
     use_layer_wise: bool = field(
         default=False,
-        metadata={
-            "help": "Use layer wise to do quantization to save memory."
-        },
+        metadata={"help": "Use layer wise to do quantization to save memory."},
     )
     quantization_methodology: str = field(
         default="rtn",
@@ -673,6 +671,7 @@ def compute_metrics(eval_preds):
                 damp_percent=optim_args.damp_percent,
                 nsamples=optim_args.num_calibration_samples,
                 blocksize=optim_args.gptq_block_size,
+                tokenizer=tokenizer,
                 **algorithm_args,
             )
         else:
diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index 6ca9fd661d..e0e141898f 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -398,6 +398,12 @@ def _weight_only_quantization(
     if (not torch.cuda.is_available() or device_map == "cpu") and model.config.model_type == "chatglm":
         model = model.float()

+    from neural_compressor.torch import load_empty_model
+
+    model = load_empty_model(
+        model_id,
+        trust_remote_code=trust_remote_code,
+    )
     model = convert_to_quantized_model(model, quantization_config, device=device_map)
     quantization_config.remove_redundant_parameters()
     model.config.quantization_config = quantization_config
diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 72f19fdb20..7d54214ca7 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -489,10 +489,10 @@ def test_weight_only_quantization(self, methodology, bits):
                 batch_size=5,
                 seq_len=32,
                 block_size=16,
-                user_layer_wise=True,
+                use_layer_wise=True,
             )
         else:
-            quantization_config = RtnConfig(bits=bits, group_size=8)
+            quantization_config = RtnConfig(bits=bits, group_size=8, use_layer_wise=True)

         tokenizer = AutoTokenizer.from_pretrained(model_name)
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
@@ -504,6 +504,7 @@ def test_weight_only_quantization(self, methodology, bits):
         with torch.no_grad():
             quantizer_outputs = quantized_model(**tokens)
         quantized_model.save_pretrained(tmp_dir)
+
         loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir)
         with torch.no_grad():
             loaded_outputs = loaded_model(**tokens)

From 9e0bb7c6d85b9a28574438cbebc354a58d64ffb7 Mon Sep 17 00:00:00 2001
From: changwa1
Date: Tue, 26 Nov 2024 15:02:47 +0800
Subject: [PATCH 03/13] improve model init

Signed-off-by: changwa1
---
 .../intel/neural_compressor/quantization.py | 41 +++++++++++--------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index e0e141898f..b0abeed05a 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -374,22 +374,33 @@ def _weight_only_quantization(
     }

     low_cpu_mem_usage = True
+
     if use_xpu:
-        try:
-            # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device.
-            model = model_class.from_pretrained(
-                model_id, low_cpu_mem_usage=low_cpu_mem_usage, device_map="cpu", **loading_kwargs
-            )
-        except NotImplementedError:
-            logger.info(
-                "Failed to load models with `low_cpu_mem_usage=True`, will fall to traditional load method resulting in higher memory consumption."
-            )
-            low_cpu_mem_usage = False
-            model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
+        if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise:
+            from neural_compressor.torch import load_empty_model
+
+            model = load_empty_model(model_id, **loading_kwargs)
+        else:
+            try:
+                # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device.
+                model = model_class.from_pretrained(
+                    model_id, low_cpu_mem_usage=low_cpu_mem_usage, device_map="cpu", **loading_kwargs
+                )
+            except NotImplementedError:
+                logger.info(
+                    "Failed to load models with `low_cpu_mem_usage=True`, will fall to traditional load method resulting in higher memory consumption."
+                )
+                low_cpu_mem_usage = False
+                model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
         quantization_config.update(**{"device": "xpu"})
         quantization_config.post_init_xpu()
     else:
-        model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
+        if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise:
+            from neural_compressor.torch import load_empty_model
+
+            model = load_empty_model(model_id, **loading_kwargs)
+        else:
+            model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
         quantization_config.post_init_cpu()

     model.config.update({"low_cpu_mem_usage": low_cpu_mem_usage})
     model.eval()
@@ -398,12 +409,6 @@ def _weight_only_quantization(
     if (not torch.cuda.is_available() or device_map == "cpu") and model.config.model_type == "chatglm":
         model = model.float()

-    from neural_compressor.torch import load_empty_model
-
-    model = load_empty_model(
-        model_id,
-        trust_remote_code=trust_remote_code,
-    )
     model = convert_to_quantized_model(model, quantization_config, device=device_map)
     quantization_config.remove_redundant_parameters()
     model.config.quantization_config = quantization_config

From 28aac242454fab5ecdc51deb5c37c34a63e057e9 Mon Sep 17 00:00:00 2001
From: changwa1
Date: Tue, 26 Nov 2024 15:24:11 +0800
Subject: [PATCH 04/13] improve ut

Signed-off-by: changwa1
---
 tests/neural_compressor/test_optimization.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 7d54214ca7..56054e480d 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -467,12 +467,14 @@ def _compute_metrics(pred):

 class WeightOnlyQuantizationTest(INCTestMixin):
     WEIGHT_ONLY_CONFIG = (
-        ("rtn", 4),
-        ("gptq", 4),
+        ("rtn", 4, False),
+        ("rtn", 4, True),
+        ("gptq", 4, False),
+        ("gptq", 4, True),
     )

     @parameterized.expand(WEIGHT_ONLY_CONFIG)
-    def test_weight_only_quantization(self, methodology, bits):
+    def test_weight_only_quantization(self, methodology, bits, use_layer_wise):
         from neural_compressor.transformers import GPTQConfig, RtnConfig

         model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
@@ -489,10 +491,10 @@ def test_weight_only_quantization(self, methodology, bits):
                 batch_size=5,
                 seq_len=32,
                 block_size=16,
-                use_layer_wise=True,
+                use_layer_wise=use_layer_wise,
             )
         else:
-            quantization_config = RtnConfig(bits=bits, group_size=8, use_layer_wise=True)
+            quantization_config = RtnConfig(bits=bits, group_size=8, use_layer_wise=use_layer_wise)

         tokenizer = AutoTokenizer.from_pretrained(model_name)
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})

From 18855fd957f1ae24973bb6469f3d7740fe3859ba Mon Sep 17 00:00:00 2001
From: changwa1
Date: Tue, 26 Nov 2024 18:00:45 +0800
Subject: [PATCH 05/13] fix loading kwargs issue

Signed-off-by: changwa1
---
 optimum/intel/neural_compressor/quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index b0abeed05a..8140dccb26 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -379,7 +379,7 @@ def _weight_only_quantization(
         if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise:
             from neural_compressor.torch import load_empty_model

-            model = load_empty_model(model_id, **loading_kwargs)
+            model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)
         else:
             try:
                 # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device.
                 model = model_class.from_pretrained(
@@ -398,7 +398,7 @@ def _weight_only_quantization(
         if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise:
             from neural_compressor.torch import load_empty_model

-            model = load_empty_model(model_id, **loading_kwargs)
+            model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)
         else:
             model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
         quantization_config.post_init_cpu()

From 0991631122caf8050cda71d819dffe9f4cd7de8c Mon Sep 17 00:00:00 2001
From: changwa1
Date: Wed, 27 Nov 2024 14:49:48 +0800
Subject: [PATCH 06/13] set neuralcompressor commit

Signed-off-by: changwa1
---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b24ef4621b..1cae650d97 100644
--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,11 @@
 EXTRAS_REQUIRE = {
     "nncf": ["nncf>=2.11.0"],
     "openvino": ["nncf>=2.11.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
-    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
+    "neural-compressor": [
+        "neural_compressor[pt]@git+https://github.com/intel/neural-compressor.git@5c72158a6799bdf0334ef36fbd493eeed3b62d9f",
+        "accelerate",
+        "transformers<4.46",
+    ],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,

From 3e21d57a61f4f67c8f849a6cb072f4464c19da74 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Fri, 29 Nov 2024 17:27:52 +0300
Subject: [PATCH 07/13] Update optimum/intel/neural_compressor/quantization.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/intel/neural_compressor/quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index 8140dccb26..e43bda0001 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -376,7 +376,7 @@ def _weight_only_quantization(
     low_cpu_mem_usage = True

     if use_xpu:
-        if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise:
+        if getattr(quantization_config, "use_layer_wise", False):
             from neural_compressor.torch import load_empty_model

             model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)

From 62c91faf300ce56af70ddcf086ee6a9188a9123f Mon Sep 17 00:00:00 2001
From: sys-lpot-val
Date: Fri, 29 Nov 2024 06:39:57 -0800
Subject: [PATCH 08/13] fix layer-wise model init

Signed-off-by: sys-lpot-val
---
 optimum/intel/neural_compressor/quantization.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index e43bda0001..a21cd7a739 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -375,12 +375,12 @@ def _weight_only_quantization(

     low_cpu_mem_usage = True

-    if use_xpu:
-        if getattr(quantization_config, "use_layer_wise", False):
-            from neural_compressor.torch import load_empty_model
+    if getattr(quantization_config, "use_layer_wise", False):
+        from neural_compressor.torch import load_empty_model

-            model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)
-        else:
+        model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)
+    else:
+        if use_xpu:
             try:
                 # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device.
                 model = model_class.from_pretrained(
@@ -394,14 +394,9 @@ def _weight_only_quantization(
                 )
                 low_cpu_mem_usage = False
                 model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
         quantization_config.update(**{"device": "xpu"})
         quantization_config.post_init_xpu()
-    else:
-        if hasattr(quantization_config, "use_layer_wise") and quantization_config.use_layer_wise:
-            from neural_compressor.torch import load_empty_model
-
-            model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)
         else:
             model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
-        quantization_config.post_init_cpu()
+            quantization_config.post_init_cpu()

     model.config.update({"low_cpu_mem_usage": low_cpu_mem_usage})
     model.eval()

From 0b369160b250ff5514ecbdc247b4e9ff15a79749 Mon Sep 17 00:00:00 2001
From: changwangss
Date: Fri, 29 Nov 2024 07:42:42 -0800
Subject: [PATCH 09/13] fix quantization_config init

Signed-off-by: changwangss
---
 optimum/intel/neural_compressor/quantization.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index a21cd7a739..dd4361bda0 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -392,11 +392,14 @@ def _weight_only_quantization(
                 )
                 low_cpu_mem_usage = False
                 model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
-        quantization_config.update(**{"device": "xpu"})
-        quantization_config.post_init_xpu()
         else:
             model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
-            quantization_config.post_init_cpu()
+
+    if use_xpu:
+        quantization_config.update(**{"device": "xpu"})
+        quantization_config.post_init_xpu()
+    else:
+        quantization_config.post_init_cpu()

     model.config.update({"low_cpu_mem_usage": low_cpu_mem_usage})
     model.eval()

From 8c5851038fbe78bd2c9139d0a4780fa22a485cde Mon Sep 17 00:00:00 2001
From: changwangss
Date: Tue, 3 Dec 2024 02:44:04 -0800
Subject: [PATCH 10/13] add limit for use_layer_wise

Signed-off-by: changwangss
---
 .../intel/neural_compressor/quantization.py | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index dd4361bda0..cf3f8dc07e 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -375,25 +375,11 @@ def _weight_only_quantization(
     low_cpu_mem_usage = True

-    if getattr(quantization_config, "use_layer_wise", False):
+    if getattr(quantization_config, "use_layer_wise", False) and token is None and subfolder == "":
         from neural_compressor.torch import load_empty_model
-
         model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)
     else:
-        if use_xpu:
-            try:
-                # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device.
-                model = model_class.from_pretrained(
-                    model_id, low_cpu_mem_usage=low_cpu_mem_usage, device_map="cpu", **loading_kwargs
-                )
-            except NotImplementedError:
-                logger.info(
-                    "Failed to load models with `low_cpu_mem_usage=True`, will fall to traditional load method resulting in higher memory consumption."
-                )
-                low_cpu_mem_usage = False
-                model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
-        else:
-            model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
+        model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)

     if use_xpu:
         quantization_config.update(**{"device": "xpu"})
         quantization_config.post_init_xpu()

From b4232c396be0d52100ce4296d1aeff01d92d074e Mon Sep 17 00:00:00 2001
From: changwangss
Date: Tue, 3 Dec 2024 22:17:11 -0800
Subject: [PATCH 11/13] fix load_empty_model

Signed-off-by: changwangss
---
 optimum/intel/neural_compressor/quantization.py | 5 +++--
 setup.py                                        | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index cf3f8dc07e..8d6d44698d 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -375,9 +375,10 @@ def _weight_only_quantization(
     low_cpu_mem_usage = True

-    if getattr(quantization_config, "use_layer_wise", False) and token is None and subfolder == "":
+    if getattr(quantization_config, "use_layer_wise", False):
         from neural_compressor.torch import load_empty_model
-        model = load_empty_model(model_id, cls=model_class, trust_remote_code=trust_remote_code)
+
+        model = load_empty_model(model_id, cls=model_class, **loading_kwargs)
     else:
         model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs)
diff --git a/setup.py b/setup.py
index 3a15828223..4e28426cc6 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": [
-        "neural_compressor[pt]@git+https://github.com/intel/neural-compressor.git@5c72158a6799bdf0334ef36fbd493eeed3b62d9f",
+        "neural_compressor[pt]@git+https://github.com/intel/neural-compressor.git@3bc8e4d0035445c51b2bd5ff6196b9b19e92b3dd",
         "accelerate",
         "transformers<4.46",
     ],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,

From dab42d89c8abd4dff7072c8ae7e0116a808fdf09 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Thu, 5 Dec 2024 18:50:23 +0800
Subject: [PATCH 12/13] Update setup.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 4e28426cc6..1bf904a745 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": [
"neural_compressor[pt]@git+https://github.com/intel/neural-compressor.git@3bc8e4d0035445c51b2bd5ff6196b9b19e92b3dd", + "neural-compressor[pt]>3.0", "accelerate", "transformers<4.46", ], From 72bda6348d3ae8de22237ccd7ee69538e16a8660 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 5 Dec 2024 03:27:19 -0800 Subject: [PATCH 13/13] add version check for layerwise feature Signed-off-by: changwangss --- optimum/intel/neural_compressor/quantization.py | 7 +++++-- tests/neural_compressor/test_optimization.py | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 8d6d44698d..92e7fc57b9 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -376,9 +376,12 @@ def _weight_only_quantization( low_cpu_mem_usage = True if getattr(quantization_config, "use_layer_wise", False): - from neural_compressor.torch import load_empty_model + if is_neural_compressor_version(">=", "3.2"): + from neural_compressor.torch import load_empty_model - model = load_empty_model(model_id, cls=model_class, **loading_kwargs) + model = load_empty_model(model_id, cls=model_class, **loading_kwargs) + else: + raise ValueError("INC version must be >= 3.2 when use_layer_wise is set to True in quantization_config.") else: model = model_class.from_pretrained(model_id, low_cpu_mem_usage=low_cpu_mem_usage, **loading_kwargs) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index 56054e480d..6b01baf705 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -45,7 +45,7 @@ set_seed, ) from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset -from optimum.intel.utils.import_utils import is_torch_version +from optimum.intel.utils.import_utils import is_neural_compressor_version from optimum.intel import ( INCConfig, @@ -475,6 +475,8 @@ class WeightOnlyQuantizationTest(INCTestMixin): @parameterized.expand(WEIGHT_ONLY_CONFIG) def test_weight_only_quantization(self, methodology, bits, use_layer_wise): + if use_layer_wise and is_neural_compressor_version("<", "3.2"): + self.skipTest("INC version < 3.2 doesn't support layer-wise feature.") from neural_compressor.transformers import GPTQConfig, RtnConfig model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"