diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx
index 77dab40159..5686af4bf3 100644
--- a/docs/source/optimization_ov.mdx
+++ b/docs/source/optimization_ov.mdx
@@ -38,8 +38,6 @@ save_dir = "ptq_model"
 def preprocess_function(examples, tokenizer):
     return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)
 
-# Load the default quantization configuration detailing the quantization we wish to apply
-quantization_config = OVConfig()
 # Instantiate our OVQuantizer using the desired configuration
 quantizer = OVQuantizer.from_pretrained(model)
 # Create the calibration dataset used to perform static quantization
@@ -52,7 +50,6 @@ calibration_dataset = quantizer.get_calibration_dataset(
 )
 # Apply static quantization and export the resulting quantized model to OpenVINO IR format
 quantizer.quantize(
-    quantization_config=quantization_config,
     calibration_dataset=calibration_dataset,
     save_directory=save_dir,
 )
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 9f3e3a06ca..8ddd005279 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -114,7 +114,7 @@ def __init__(
         **kwargs,
     ):
         super().__init__()
-        self.compression = compression or DEFAULT_QUANTIZATION_CONFIG
+        self.compression = compression
         self.input_info = input_info
         self.save_onnx_model = save_onnx_model
         self._enable_standard_onnx_export_option()
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 51633b0210..af00f7a06e 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -57,6 +57,7 @@ def __init__(
         dynamic_shapes: bool = True,
         ov_config: Optional[Dict[str, str]] = None,
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
         self.config = config
@@ -91,6 +92,10 @@ def __init__(
 
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
 
+        self._openvino_config = None
+        if quantization_config:
+            self._openvino_config = OVConfig(quantization_config=quantization_config)
+
     @staticmethod
     def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None):
         """
@@ -143,6 +148,15 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
         dst_path = os.path.join(save_directory, OV_XML_FILE_NAME)
         openvino.save_model(self.model, dst_path, compress_to_fp16=False)
 
+        self._save_openvino_config(save_directory)
+
+    def _save_openvino_config(self, save_directory: Union[str, Path]):
+        if self._openvino_config is not None:
+            if not isinstance(self._openvino_config.quantization_config.dataset, (str, type(None))):
+                self._openvino_config.quantization_config.dataset = None
+
+            self._openvino_config.save_pretrained(save_directory)
+
     @classmethod
     def _from_pretrained(
         cls,
@@ -203,12 +217,28 @@ def _from_pretrained(
             local_files_only=local_files_only,
         )
 
-        # Give default quantization config if not provided and load_in_8bit=True
-        if load_in_8bit:
-            quantization_config = quantization_config or {"bits": 8}
+        quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
 
         model = cls.load_model(model_cache_path, quantization_config=quantization_config)
-        return cls(model, config=config, model_save_dir=model_cache_path.parent, **kwargs)
+        return cls(
+            model,
+            config=config,
+            model_save_dir=model_cache_path.parent,
+            quantization_config=quantization_config,
+            **kwargs,
+        )
+
+    @staticmethod
+    def _prepare_weight_quantization_config(
+        quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, load_in_8bit: bool = False
+    ):
+        # Give default quantization config if not provided and load_in_8bit=True
+        if not quantization_config and load_in_8bit:
+            quantization_config = OVWeightQuantizationConfig(bits=8)
+        elif isinstance(quantization_config, dict):
+            quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
+
+        return quantization_config
 
     @staticmethod
     def _cached_file(
diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index df9449b0b5..3cb43e61b8 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -58,6 +58,7 @@ def __init__(
         dynamic_shapes: bool = True,
         ov_config: Optional[Dict[str, str]] = None,
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
         **kwargs,
     ):
         self.config = config
@@ -76,6 +77,9 @@ def __init__(
         self.decoder_model = decoder
         self.decoder_with_past_model = decoder_with_past
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
+        self._openvino_config = None
+        if quantization_config:
+            self._openvino_config = OVConfig(quantization_config=quantization_config)
 
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """
@@ -96,6 +100,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             dst_path = os.path.join(save_directory, dst_file_name)
             openvino.save_model(src_file, dst_path, compress_to_fp16=False)
 
+        self._save_openvino_config(save_directory)
+
     @classmethod
     def _from_pretrained(
         cls,
@@ -155,9 +161,7 @@ def _from_pretrained(
         decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
         decoder_with_past = None
 
-        # Give default quantization config if not provided and load_in_8bit=True
-        if load_in_8bit:
-            quantization_config = quantization_config or {"bits": 8}
+        quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
 
         # Load model from a local directory
         if os.path.isdir(model_id):
@@ -205,6 +209,7 @@ def _from_pretrained(
             decoder_with_past=decoder_with_past,
             config=config,
             model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
             **kwargs,
         )
 
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index c0274d3f5b..92a2ce436d 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import logging
 import os
 from pathlib import Path
@@ -100,6 +101,7 @@ def __init__(
         dynamic_shapes: bool = True,
         ov_config: Optional[Dict[str, str]] = None,
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
         if not dynamic_shapes:
@@ -117,6 +119,7 @@ def __init__(
             dynamic_shapes=False,
             ov_config=ov_config,
             model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
             **kwargs,
         )
 
@@ -224,6 +227,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
         dst_path = os.path.join(save_directory, OV_XML_FILE_NAME)
         openvino.save_model(model_to_save, dst_path, compress_to_fp16=False)
 
+        self._save_openvino_config(save_directory)
+
     @classmethod
     def _from_transformers(
         cls,
@@ -576,15 +581,10 @@ def _from_pretrained(
             local_files_only=local_files_only,
         )
 
-        # Give default quantization config if not provided and load_in_8bit=True
-        if load_in_8bit:
-            quantization_config = quantization_config or {"bits": 8}
-
-        if isinstance(quantization_config, dict):
-            if quantization_config == {"bits": 4} and config.name_or_path in _DEFAULT_4BIT_CONFIGS:
-                quantization_config = _DEFAULT_4BIT_CONFIGS[config.name_or_path]
+        if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}:
+            quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config)
 
-            quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
+        quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
 
         load_in_4bit = quantization_config.bits == 4 if quantization_config else False
         model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config)
@@ -603,7 +603,12 @@ def _from_pretrained(
 
         enable_compilation = kwargs.pop("compile", True) and not load_in_4bit
         causal_model = init_cls(
-            model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs
+            model=model,
+            config=config,
+            model_save_dir=model_cache_path.parent,
+            compile=enable_compilation,
+            quantization_config=quantization_config,
+            **kwargs,
         )
 
         if load_in_4bit:
@@ -632,6 +637,7 @@ def _from_pretrained(
                 # seqlen = get_seqlen(causal_model)
                 dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32)
                 dataset = prepare_dataset(dataset)
+                quantization_config = copy.deepcopy(quantization_config)
                 quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
 
             _weight_only_quantization(model, quantization_config)
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 5633f852a8..1570a22457 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -87,6 +87,7 @@ def __init__(
         compile: bool = True,
         ov_config: Optional[Dict[str, str]] = None,
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
         self._internal_dict = config
@@ -140,6 +141,10 @@ def __init__(
 
         self._internal_dict.pop("vae", None)
 
+        self._openvino_config = None
+        if quantization_config:
+            self._openvino_config = OVConfig(quantization_config=quantization_config)
+
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """
         Saves the model to the OpenVINO IR format so that it can be re-loaded using the
@@ -177,6 +182,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
         if self.tokenizer_2 is not None:
             self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2")
 
+        self._save_openvino_config(save_directory)
+
     @classmethod
     def _from_pretrained(
         cls,
@@ -257,10 +264,7 @@ def _from_pretrained(
             else:
                 kwargs[name] = load_method(new_model_save_dir)
 
-        # Give default quantization config if not provided and load_in_8bit=True
-        if load_in_8bit:
-            quantization_config = quantization_config or {"bits": 8}
-
+        quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         unet = cls.load_model(
             new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, quantization_config
         )
@@ -278,7 +282,14 @@ def _from_pretrained(
         if model_save_dir is None:
             model_save_dir = new_model_save_dir
 
-        return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
+        return cls(
+            unet=unet,
+            config=config,
+            model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
+            **components,
+            **kwargs,
+        )
 
     @classmethod
     def _from_transformers(
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 331248e023..d7b88f2be3 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -45,7 +45,7 @@
 from ..utils.constant import _TASK_ALIASES
 from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available
 from ..utils.modeling_utils import get_model_device
-from .configuration import OVConfig, OVWeightQuantizationConfig
+from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel
 from .utils import (
     MAX_ONNX_OPSET,
@@ -235,8 +235,11 @@ def quantize(
             )
         ov_config = ov_config or quantization_config
 
-        if ov_config is not None and not isinstance(ov_config, OVConfig):
-            raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.")
+        if ov_config is not None:
+            if not isinstance(ov_config, OVConfig):
+                raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.")
+            elif ov_config.compression is None:
+                ov_config.compression = DEFAULT_QUANTIZATION_CONFIG
 
         if isinstance(self.model, OVBaseModel):
             self._quantize_ovbasemodel(
@@ -355,7 +358,7 @@ def _quantize_torchmodel(
             logger.info(
                 "No configuration describing the quantization process was provided, a default OVConfig will be generated."
             )
-            ov_config = OVConfig()
+            ov_config = OVConfig(compression=DEFAULT_QUANTIZATION_CONFIG)
         onnx_file_name = (
             ONNX_WEIGHTS_NAME
             if file_name is None and ov_config.save_onnx_model
diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index 5c7d392292..b7d110c96a 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -89,7 +89,7 @@
 
 from ..utils.constant import _TASK_ALIASES
 from ..utils.import_utils import is_transformers_version
-from .configuration import OVConfig
+from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig
 from .quantization import OVDataLoader
 from .training_args import OVTrainingArguments
 from .utils import (
@@ -225,37 +225,41 @@ def __init__(
             self.teacher.eval()
 
         self.compression_controller = None
-        if self.ov_config is not None and self.args.do_train:
-            self._set_task()
-            train_dataloader = self.get_train_dataloader()
-            model_inputs = next(iter(train_dataloader))
-            for label_name in self.label_names:
-                model_inputs.pop(label_name)
-            force_batch_one = self._is_pruning_enabled()
-            self.ov_config.add_input_info(model_inputs, force_batch_one)
-            nncf_config = NNCFConfig.from_dict(self.ov_config.__dict__)
-            nncf_config.register_extra_structs(
-                [
-                    QuantizationRangeInitArgs(OVDataLoader(train_dataloader)),
-                    BNAdaptationInitArgs(OVDataLoader(train_dataloader)),
-                ]
-            )
+        if self.ov_config is not None:
+            if self.ov_config.compression is None:
+                self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG
+
+            if self.args.do_train:
+                self._set_task()
+                train_dataloader = self.get_train_dataloader()
+                model_inputs = next(iter(train_dataloader))
+                for label_name in self.label_names:
+                    model_inputs.pop(label_name)
+                force_batch_one = self._is_pruning_enabled()
+                self.ov_config.add_input_info(model_inputs, force_batch_one)
+                nncf_config = NNCFConfig.from_dict(self.ov_config.__dict__)
+                nncf_config.register_extra_structs(
+                    [
+                        QuantizationRangeInitArgs(OVDataLoader(train_dataloader)),
+                        BNAdaptationInitArgs(OVDataLoader(train_dataloader)),
+                    ]
+                )
 
-            # Configure NNCF logging
-            # Disable nncf logging to stdout except error
-            # but to file nncf_output.log
-            nncf_config["log_dir"] = args.output_dir
-            nncf_log_file_handler = logging.logging.FileHandler(os.path.join(args.output_dir, NNCF_LOG_FILE_NAME))
-            nncf_log_file_handler.setFormatter(logging.logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
-            nncf_logger.addHandler(nncf_log_file_handler)
-            set_log_level(logging.ERROR)
-            nncf_logger.setLevel(logging.INFO)
-            nncf_log_file_handler.setLevel(logging.INFO)
-
-            self.compression_controller, self.model = create_compressed_model(self.model, nncf_config)
-            self.model_wrapped = self.model
-            # TODO : To deprecate once support transformers > 4.30.0
-            self.deepspeed = None
+                # Configure NNCF logging
+                # Disable nncf logging to stdout except error
+                # but to file nncf_output.log
+                nncf_config["log_dir"] = args.output_dir
+                nncf_log_file_handler = logging.logging.FileHandler(os.path.join(args.output_dir, NNCF_LOG_FILE_NAME))
+                nncf_log_file_handler.setFormatter(logging.logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
+                nncf_logger.addHandler(nncf_log_file_handler)
+                set_log_level(logging.ERROR)
+                nncf_logger.setLevel(logging.INFO)
+                nncf_log_file_handler.setLevel(logging.INFO)
+
+                self.compression_controller, self.model = create_compressed_model(self.model, nncf_config)
+                self.model_wrapped = self.model
+                # TODO : To deprecate once support transformers > 4.30.0
+                self.deepspeed = None
 
     def _set_signature_columns_if_needed(self):
         if self._signature_columns is None:
diff --git a/setup.py b/setup.py
index 1701af990c..91fc19f744 100644
--- a/setup.py
+++ b/setup.py
@@ -39,16 +39,8 @@
 QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"]
 
 EXTRAS_REQUIRE = {
-    "neural-compressor": [
-        "neural-compressor>=2.2.0",
-        "onnx",
-        "onnxruntime<1.15.0",
-    ],
-    "openvino": [
-        "openvino>=2023.3",
-        "onnx",
-        "onnxruntime",
-    ],
+    "neural-compressor": ["neural-compressor>=2.2.0", "onnx", "onnxruntime<1.15.0"],
+    "openvino": ["openvino>=2023.3", "onnx", "onnxruntime"],
     "openvino-tokenizers": ["openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.8.1"],
     "ipex": ["intel-extension-for-pytorch", "onnx"],
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 16f848da9e..0ef89ec8b8 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -56,8 +56,7 @@
     OVWeightQuantizationConfig,
 )
 
-
-from optimum.intel.openvino.configuration import INT8_WEIGHT_COMPRESSION_CONFIG
+from optimum.intel.openvino.configuration import INT8_WEIGHT_COMPRESSION_CONFIG, DEFAULT_QUANTIZATION_CONFIG
 from optimum.intel.openvino.quantization import InferRequestWrapper
 from optimum.intel.utils.import_utils import is_openvino_version
 from utils_tests import MODEL_NAMES, get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8
@@ -111,9 +110,8 @@ def preprocess_function(examples, tokenizer):
             self.assertTrue("logits" in outputs)
 
             # Verify that that the configuration is correctly saved and loaded
-            expected_config = OVConfig()
             loaded_config = OVConfig.from_pretrained(tmp_dir)
-            self.assertEqual(expected_config.to_dict()["compression"], loaded_config.to_dict()["compression"])
+            self.assertEqual(DEFAULT_QUANTIZATION_CONFIG, loaded_config.to_dict()["compression"])
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
     def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8):
@@ -160,7 +158,7 @@ class OVWeightCompressionTest(unittest.TestCase):
     )
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),)
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 6, 379),)
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 388),)
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = (
         (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 16, 136),
     )
@@ -237,6 +235,8 @@ class OVWeightCompressionTest(unittest.TestCase):
 
     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
 
+    DEFAULT_INT4_CONFIG = {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
         task = model_cls.export_feature
@@ -336,6 +336,8 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
     def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
         model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False)
+        self.assertEqual(model._openvino_config.quantization_config.bits, 8)
+        self.assertEqual(model._openvino_config.dtype, "int8")
 
         if model.export_feature.startswith("text2text-generation"):
             models = [model.encoder, model.decoder, model.decoder_with_past]
@@ -351,12 +353,13 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
             self.assertEqual(expected_ov_int8[i], num_int8)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS)
+    @unittest.mock.patch.dict(
+        "optimum.intel.openvino.configuration._DEFAULT_4BIT_CONFIGS", {"facebook/opt-125m": DEFAULT_INT4_CONFIG}
+    )
     def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_int8, expected_ov_int4):
         with tempfile.TemporaryDirectory() as tmp_dir:
             model_id = MODEL_NAMES[model_type]
-            model = model_cls.from_pretrained(
-                model_id, export=True, quantization_config=OVWeightQuantizationConfig(bits=4)
-            )
+            model = model_cls.from_pretrained(model_id, export=True, quantization_config={"bits": 4})
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
@@ -366,6 +369,13 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
             self.assertEqual(expected_ov_int8, num_int8)
 
             model.save_pretrained(tmp_dir)
+            openvino_config = OVConfig.from_pretrained(tmp_dir)
+            self.assertEqual(openvino_config.quantization_config["bits"], 4)
+            self.assertEqual(openvino_config.dtype, "int4")
+            if model_id == "facebook/opt-125m":
+                for key, value in self.DEFAULT_INT4_CONFIG.items():
+                    self.assertEqual(value, openvino_config.quantization_config[key])
+
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
         self, model_cls, model_id, quantization_config, expected_ov_int4
@@ -380,8 +390,9 @@ def test_ovmodel_4bit_auto_compression_with_config(
             self.assertEqual(expected_ov_int4, num_int4)
 
             model.save_pretrained(tmp_dir)
-            ov_config = OVConfig(quantization_config=quantization_config)
-            ov_config.save_pretrained(tmp_dir)
+            openvino_config = OVConfig.from_pretrained(tmp_dir)
+            self.assertEqual(openvino_config.quantization_config["bits"], 4)
+            self.assertEqual(openvino_config.dtype, "int4")
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS)
     def test_ovmodel_4bit_auto_compression_with_custom_dataset(
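
For context, a minimal sketch of the round-trip these changes enable: a weight-quantization config passed at load time is normalized by _prepare_weight_quantization_config(), kept on the model as _openvino_config, and written back out by _save_openvino_config() during save_pretrained(). The checkpoint name and output directory below are illustrative only, not taken from the patch.

from optimum.intel import OVConfig, OVModelForCausalLM

# Illustrative checkpoint and output directory (assumptions, not part of the patch).
model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
save_dir = "ov_model_int4"

# A plain dict is now accepted and converted into an OVWeightQuantizationConfig internally.
model = OVModelForCausalLM.from_pretrained(model_id, export=True, quantization_config={"bits": 4})

# Saving also persists the quantization settings next to the OpenVINO IR.
model.save_pretrained(save_dir)

# The saved configuration can be reloaded and inspected, as the new tests do.
loaded = OVConfig.from_pretrained(save_dir)
print(loaded.quantization_config["bits"])  # 4
print(loaded.dtype)  # "int4"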