From 6ed6a091365238e0076b3cca6189c2cc41a78843 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Mon, 20 Jan 2025 14:06:50 +0000 Subject: [PATCH 1/3] Raise an error when OVQuantizer is invoked on an already compressed model --- optimum/intel/openvino/quantization.py | 44 ++++++++++++++++++++++++-- tests/openvino/test_quantization.py | 17 ++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f61c2b93ca..cd337fdde7 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -315,6 +315,31 @@ def quantize( else: raise TypeError(f"Unsupported model type: {type(self.model)}") + def _check_model_state(self, sub_model_names: List[str] = None): + message_template = ( + "Couldn't apply optimization to the model because it was already compressed with config: {}. " + "To avoid this issue, set load_in_8bit=False in the from_pretrained method when using the optimum-intel API, " + "or explicitly specify the desired weight format using --weight_format fp16/fp32 for CLI." + ) + + def check_rt_info(ov_model): + rt_info = ov_model.get_rt_info() + if "nncf" in rt_info: + model_weight_compression_config = rt_info["nncf"].get("weight_compression", None) + model_quantization_config = rt_info["nncf"].get("quantization", None) + if model_weight_compression_config is not None: + raise RuntimeError(message_template.format(model_weight_compression_config)) + elif model_quantization_config is not None: + raise RuntimeError(message_template.format(model_quantization_config)) + + if sub_model_names is None: + check_rt_info(self.model.model) + else: + for name in sub_model_names: + if hasattr(self.model, name): + ov_model = getattr(self.model, name).model + check_rt_info(ov_model) + def _quantize_ovbasemodel( self, ov_config: OVConfig, @@ -325,7 +350,7 @@ def _quantize_ovbasemodel( remove_unused_columns: bool = True, **kwargs, ): - from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper + from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper, OVModelForSeq2SeqLM from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM if is_diffusers_available(): @@ -404,6 +429,7 @@ def _quantize_ovbasemodel( "text_encoder_2", "text_encoder_3", ] + self._check_model_state(sub_model_names) sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config_copy, **kwargs) @@ -421,6 +447,7 @@ def _quantize_ovbasemodel( self.model.clear_requests() else: # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. 
+ self._check_model_state() self.model.model = _hybrid_quantization( self.model.model, quantization_config, calibration_dataset, **kwargs ) @@ -436,19 +463,31 @@ def _quantize_ovbasemodel( "transformer", "text_encoder_3", ] + self._check_model_state(sub_model_names) sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config, **kwargs) self.model.clear_requests() elif isinstance(self.model, OVModelForVisualCausalLM): language_model = self.model.language_model - _weight_only_quantization(language_model.model, quantization_config, calibration_dataset, **kwargs) sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts + self._check_model_state(sub_model_names + ["language_model"]) + _weight_only_quantization(language_model.model, quantization_config, calibration_dataset, **kwargs) sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names] for sub_model in sub_models: _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=True), **kwargs) self.model.clear_requests() + elif isinstance(self.model, OVModelForSeq2SeqLM): + sub_model_names = ["encoder", "decoder"] + if self.model.decoder_with_past is not None: + sub_model_names.append("decoder_with_past") + self._check_model_state(sub_model_names) + sub_models = [getattr(self.model, name) for name in sub_model_names] + for sub_model in sub_models: + _weight_only_quantization(sub_model, quantization_config, **kwargs) + self.model.clear_requests() else: + self._check_model_state() _weight_only_quantization(self.model.model, quantization_config, calibration_dataset, **kwargs) self.model.request = None else: @@ -460,6 +499,7 @@ def _quantize_ovbasemodel( # Quantize model(s) if isinstance(self.model, _OVModelForWhisper): + self._check_model_state(["encoder_model", "decoder_model", "decoder_with_past_model"]) self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs) else: quantized_model = _full_quantization( diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 1df43d5480..f1124cd88a 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -723,6 +723,23 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust _, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"]) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) + def test_raise_error_WC_over_WC(self, model_cls, model_type, trust_remote_code): + model = model_cls.from_pretrained( + MODEL_NAMES[model_type], + export=True, + load_in_8bit=True, + trust_remote_code=trust_remote_code, + ) + quantization_config = OVWeightQuantizationConfig(bits=4, sym=True) + quantizer = OVQuantizer(model) + if isinstance(model, OVModelOpenCLIPForZeroShotImageClassification): + with pytest.raises(TypeError): + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + else: + with pytest.raises(RuntimeError): + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_nodes, expected_int8_nodes): model_id = MODEL_NAMES[model_type] From e51f426dddd01b2718a7360c96d0f2cbf4d17f05 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: 
Thu, 23 Jan 2025 17:09:59 +0000 Subject: [PATCH 2/3] Update tests --- tests/openvino/test_quantization.py | 36 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f1124cd88a..9baedd223c 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -214,6 +214,7 @@ def preprocess_function(examples, tokenizer): # Verify that the configuration is correctly saved and loaded loaded_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + check_optimization_not_applicable_to_optimized_model(model, quantization_config=OVWeightQuantizationConfig(bits=8)) @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET) def test_ov_model_static_quantization_with_auto_dataset( @@ -255,6 +256,7 @@ def test_ov_model_static_quantization_with_auto_dataset( self.assertTrue("logits" in outputs) else: raise Exception("Unexpected model class.") + check_optimization_not_applicable_to_optimized_model(ov_model, quantization_config=quantization_config) class OVWeightCompressionTest(unittest.TestCase): @@ -718,28 +720,18 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust else: models = [model] + if model_type == "open-clip": + pytest.skip(reason="ticket 161043") + elif model_type == "t5": + pytest.skip(reason="ticket 160958") + else: + check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8}) + expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): _, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"]) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) - def test_raise_error_WC_over_WC(self, model_cls, model_type, trust_remote_code): - model = model_cls.from_pretrained( - MODEL_NAMES[model_type], - export=True, - load_in_8bit=True, - trust_remote_code=trust_remote_code, - ) - quantization_config = OVWeightQuantizationConfig(bits=4, sym=True) - quantizer = OVQuantizer(model) - if isinstance(model, OVModelOpenCLIPForZeroShotImageClassification): - with pytest.raises(TypeError): - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - else: - with pytest.raises(RuntimeError): - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_nodes, expected_int8_nodes): model_id = MODEL_NAMES[model_type] @@ -755,6 +747,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_ self.assertEqual(0, num_weight_nodes["int4"]) model.save_pretrained(tmp_dir) + check_optimization_not_applicable_to_optimized_model(model, quantization_config=quantization_config) def test_stable_diffusion_with_weight_compression(self): int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True) @@ -769,6 +762,8 @@ def test_stable_diffusion_with_weight_compression(self): self.assertEqual(0, num_fake_nodes) self.assertEqual(242, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) + quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2, 
quant_method=OVQuantizationMethod.HYBRID) + check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config=quantization_config) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:]) def test_ovmodel_hybrid_quantization_with_custom_dataset( @@ -814,6 +809,7 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ if model_id == "facebook/opt-125m": for key, value in self.DEFAULT_INT4_CONFIG.items(): self.assertEqual(value, getattr(openvino_config.quantization_config, key)) + check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8}) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( @@ -1338,3 +1334,9 @@ def test_calibration_data_uniqueness(self, model_name, apply_caching): else: # Without caching, encoder hidden states tensors will be unique for each collected input self.assertGreater(len(data_id_per_key["encoder_hidden_states"]), 2) + + +def check_optimization_not_applicable_to_optimized_model(model, quantization_config): + quantizer = OVQuantizer(model) + with pytest.raises(RuntimeError, match="Cannot apply optimization to the model because it was already optimized with the following config"): + quantizer.quantize(quantization_config=quantization_config) From 92facae2b2a57f39cd2c45d2e7da5540e87d23e9 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Thu, 23 Jan 2025 19:08:15 +0000 Subject: [PATCH 3/3] apply comments --- optimum/intel/openvino/quantization.py | 63 +++++++++----------------- tests/openvino/test_quantization.py | 20 +++++--- 2 files changed, 34 insertions(+), 49 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index cd337fdde7..e89fa9b449 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -315,31 +315,6 @@ def quantize( else: raise TypeError(f"Unsupported model type: {type(self.model)}") - def _check_model_state(self, sub_model_names: List[str] = None): - message_template = ( - "Couldn't apply optimization to the model because it was already compressed with config: {}. " - "To avoid this issue, set load_in_8bit=False in the from_pretrained method when using the optimum-intel API, " - "or explicitly specify the desired weight format using --weight_format fp16/fp32 for CLI." 
- ) - - def check_rt_info(ov_model): - rt_info = ov_model.get_rt_info() - if "nncf" in rt_info: - model_weight_compression_config = rt_info["nncf"].get("weight_compression", None) - model_quantization_config = rt_info["nncf"].get("quantization", None) - if model_weight_compression_config is not None: - raise RuntimeError(message_template.format(model_weight_compression_config)) - elif model_quantization_config is not None: - raise RuntimeError(message_template.format(model_quantization_config)) - - if sub_model_names is None: - check_rt_info(self.model.model) - else: - for name in sub_model_names: - if hasattr(self.model, name): - ov_model = getattr(self.model, name).model - check_rt_info(ov_model) - def _quantize_ovbasemodel( self, ov_config: OVConfig, @@ -350,7 +325,7 @@ def _quantize_ovbasemodel( remove_unused_columns: bool = True, **kwargs, ): - from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper, OVModelForSeq2SeqLM + from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM if is_diffusers_available(): @@ -429,7 +404,6 @@ def _quantize_ovbasemodel( "text_encoder_2", "text_encoder_3", ] - self._check_model_state(sub_model_names) sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config_copy, **kwargs) @@ -447,7 +421,6 @@ def _quantize_ovbasemodel( self.model.clear_requests() else: # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. - self._check_model_state() self.model.model = _hybrid_quantization( self.model.model, quantization_config, calibration_dataset, **kwargs ) @@ -463,31 +436,19 @@ def _quantize_ovbasemodel( "transformer", "text_encoder_3", ] - self._check_model_state(sub_model_names) sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config, **kwargs) self.model.clear_requests() elif isinstance(self.model, OVModelForVisualCausalLM): language_model = self.model.language_model - sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts - self._check_model_state(sub_model_names + ["language_model"]) _weight_only_quantization(language_model.model, quantization_config, calibration_dataset, **kwargs) + sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names] for sub_model in sub_models: _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=True), **kwargs) self.model.clear_requests() - elif isinstance(self.model, OVModelForSeq2SeqLM): - sub_model_names = ["encoder", "decoder"] - if self.model.decoder_with_past is not None: - sub_model_names.append("decoder_with_past") - self._check_model_state(sub_model_names) - sub_models = [getattr(self.model, name) for name in sub_model_names] - for sub_model in sub_models: - _weight_only_quantization(sub_model, quantization_config, **kwargs) - self.model.clear_requests() else: - self._check_model_state() _weight_only_quantization(self.model.model, quantization_config, calibration_dataset, **kwargs) self.model.request = None else: @@ -499,7 +460,6 @@ def _quantize_ovbasemodel( # Quantize model(s) if isinstance(self.model, _OVModelForWhisper): - 
self._check_model_state(["encoder_model", "decoder_model", "decoder_with_past_model"])
             self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
         else:
             quantized_model = _full_quantization(
@@ -1050,6 +1010,7 @@ def _weight_only_quantization(
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
     **kwargs,
 ) -> openvino.runtime.Model:
+    _verify_not_optimized(model)
     config = quantization_config
     if isinstance(config, dict):
         config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -1106,6 +1067,7 @@ def _full_quantization(
     calibration_dataset: nncf.Dataset,
     **kwargs,
 ):
+    _verify_not_optimized(model)
     advanced_parameters_kwargs = {}
     if quantization_config.smooth_quant_alpha is not None:
         advanced_parameters_kwargs["smooth_quant_alphas"] = AdvancedSmoothQuantParameters(
@@ -1227,3 +1189,20 @@ def _hybrid_quantization(
         **kwargs,
     )
     return quantized_model
+
+
+def _verify_not_optimized(ov_model):
+    message_template = (
+        "Cannot apply optimization to the model because it was already optimized with the following config: {}. "
+        "To avoid this issue, set load_in_8bit=False when calling .from_pretrained() and do not pass a quantization_config at export, "
+        "or explicitly specify the desired weight format with --weight_format fp16/fp32 when using the CLI."
+    )
+
+    rt_info = ov_model.get_rt_info()
+    if "nncf" in rt_info:
+        model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
+        model_quantization_config = rt_info["nncf"].get("quantization", None)
+        if model_weight_compression_config is not None:
+            raise RuntimeError(message_template.format(model_weight_compression_config))
+        elif model_quantization_config is not None:
+            raise RuntimeError(message_template.format(model_quantization_config))
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 9baedd223c..c4c0ff247d 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -214,7 +214,9 @@ def preprocess_function(examples, tokenizer):
         # Verify that the configuration is correctly saved and loaded
         loaded_config = OVConfig.from_pretrained(tmp_dir)
         self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
-        check_optimization_not_applicable_to_optimized_model(model, quantization_config=OVWeightQuantizationConfig(bits=8))
+        check_optimization_not_applicable_to_optimized_model(
+            model, quantization_config=OVWeightQuantizationConfig(bits=8)
+        )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET)
     def test_ov_model_static_quantization_with_auto_dataset(
@@ -256,7 +258,6 @@ def test_ov_model_static_quantization_with_auto_dataset(
             self.assertTrue("logits" in outputs)
         else:
             raise Exception("Unexpected model class.")
-        check_optimization_not_applicable_to_optimized_model(ov_model, quantization_config=quantization_config)
 
 
 class OVWeightCompressionTest(unittest.TestCase):
@@ -747,7 +748,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_
             self.assertEqual(0, num_weight_nodes["int4"])
 
             model.save_pretrained(tmp_dir)
-            check_optimization_not_applicable_to_optimized_model(model, quantization_config=quantization_config)
+            check_optimization_not_applicable_to_optimized_model(model, quantization_config)
 
     def 
test_stable_diffusion_with_weight_compression(self): self.assertEqual(0, num_fake_nodes) self.assertEqual(242, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) - quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID) - check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config=quantization_config) + quantization_config = OVWeightQuantizationConfig( + bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID + ) + check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:]) def test_ovmodel_hybrid_quantization_with_custom_dataset( @@ -1338,5 +1341,8 @@ def test_calibration_data_uniqueness(self, model_name, apply_caching): def check_optimization_not_applicable_to_optimized_model(model, quantization_config): quantizer = OVQuantizer(model) - with pytest.raises(RuntimeError, match="Cannot apply optimization to the model because it was already optimized with the following config"): - quantizer.quantize(quantization_config=quantization_config) + with pytest.raises( + RuntimeError, + match="Cannot apply optimization to the model because it was already optimized with the following config", + ): + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
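
A minimal usage sketch of the new guard, assuming the public optimum-intel API exercised by the tests above (the model id is illustrative; any model exported with load_in_8bit=True behaves the same):

    from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

    # Exporting with load_in_8bit=True compresses the weights and records an
    # "nncf" -> "weight_compression" entry in the OpenVINO model's rt_info.
    model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)

    # A second compression pass over the already-optimized weights now fails fast
    # instead of silently re-compressing them.
    quantizer = OVQuantizer(model)
    try:
        quantizer.quantize(
            ov_config=OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True))
        )
    except RuntimeError as err:
        print(err)  # Cannot apply optimization to the model because it was already optimized ...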