From c16fba73fda4b8676b7a6dce0f3aceaacb6bef14 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 11:59:34 +0200
Subject: [PATCH 01/26] add warning

---
 optimum/intel/openvino/modeling_decoder.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 44137186e2..122972fad4 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -597,6 +597,11 @@ def _from_pretrained(
         load_in_4bit = quantization_config.bits == 4 if quantization_config else False
         calibration_dataset = kwargs.get("calibration_dataset", None)
+
+        if calibration_dataset is not None:
+            logger.warning(
+                "The `calibration_dataset` argument is deprecated and will be removed. To apply quantization with a custom dataset, please use the `OVQuantizer`."
+            )
 
         model = cls.load_model(
             model_cache_path,
             quantization_config=None if load_in_4bit else quantization_config,

From 5805bf6fbf3be7984f51e250b784d9a13f95b809 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 14:31:44 +0200
Subject: [PATCH 02/26] remove deprecated feature arg

---
 optimum/intel/openvino/quantization.py | 24 ++++++++----------------
 optimum/intel/openvino/trainer.py      |  9 +--------
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 33985dbe6e..d80a4ca6fc 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -180,22 +180,15 @@ def __init__(self, model: transformers.PreTrainedModel, task: Optional[str] = No
         """
         super().__init__()
         self.model = model
-        feature = kwargs.pop("feature", None)
-        if feature is not None:
-            logger.warning("`feature` is deprecated and will be removed in a future version. Use `task` instead.")
-            if task is not None and task != feature:
-                logger.warning(
-                    f"Both `feature` and `task` were specified. {task} will be used to define the model topology for the model ONNX export."
-                )
-        self.task = task or feature
+        self.task = task
         self.seed = seed
-        # TODO : deprecate input_names
-        self.input_names = None
         signature = inspect.signature(self.model.forward)
         self._signature_columns = list(signature.parameters.keys())
-        self._export_input_names = [
-            column for column in self._signature_columns if column not in {"label", "labels", "label_ids"}
-        ]
+
+    @property
+    def input_names(self):
+        logger.warning("The `input_names` attribute is deprecated and will be removed in v1.18.0")
+        return None
 
     @classmethod
     def from_pretrained(cls, model: PreTrainedModel, **kwargs):
@@ -265,9 +258,8 @@ def quantize(
         # TODO: deprecate weights_only argument
         if weights_only is not None:
             logger.warning(
-                "`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` "
-                "as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of "
-                "OVQuantizationConfig for full model quantization."
+                "`weights_only` argument is deprecated and will be removed in v1.18.0. In the future please provide `ov_config.quantization_config` "
+                "as an instance of `OVWeightQuantizationConfig` for weight-only compression or as an instance of `OVQuantizationConfig` for full model quantization."
             )
 
         if save_directory is None:

diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index 873b0909c8..0745a1cd79 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -214,7 +214,6 @@ def __init__(
         preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None,
         ov_config: Optional[OVConfig] = None,
         task: Optional[str] = None,
-        feature: Optional[str] = None,
     ):
         self.neftune_noise_alpha = None
 
@@ -233,13 +232,7 @@ def __init__(
         )
 
         self.ov_config = ov_config
-        if feature is not None:
-            logger.warning("`feature` is deprecated and will be removed in a future version. Use `task` instead.")
-            if task is not None and task != feature:
-                logger.warning(
-                    f"Both `feature` and `task` were specified. {task} will be used to define the model topology for the model ONNX export."
-                )
-        self.task = task or feature
+        self.task = task
         self.teacher = None
         if teacher_model is not None:
            self.teacher = teacher_model.to(args.device)
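Note: the snippet below is an illustrative sketch, not part of the series; the checkpoint name and task string are placeholders. It shows what call sites look like once PATCH 01/02 land: `task` replaces the removed `feature` keyword, and the kind of quantization is carried by the config object instead of the deprecated `weights_only` flag.

    from transformers import AutoModelForSequenceClassification

    from optimum.intel import OVConfig, OVQuantizer, OVWeightQuantizationConfig

    model = AutoModelForSequenceClassification.from_pretrained("hf-internal-testing/tiny-random-bert")
    # `task` is passed directly; the old `feature` keyword is gone
    quantizer = OVQuantizer.from_pretrained(model, task="text-classification")
    # instead of `weights_only=True`, the intent is expressed by the config type
    ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8))

Passing this `ov_config` to `quantizer.quantize(...)` then selects weight-only compression without the deprecated flag.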
From e9b6aa0df40f79e9ee6774148a3767385dbc9de5 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 14:41:16 +0200
Subject: [PATCH 03/26] add model arch

---
 tests/openvino/test_quantization.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index b22d5e3955..724f5550a2 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -76,14 +76,14 @@ class OVQuantizerTest(unittest.TestCase):
-    # TODO : add models, enable OVModelForCausalLM.
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
-        (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 32, 35),
-        # (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 23),
+        (OVModelForSequenceClassification, "bert", 32, 35),
+        (OVModelForCausalLM, "gpt2", 41, 23),
     )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
-    def test_automodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8):
+    def test_automodel_static_quantization(self, model_cls, model_arch, expected_fake_quantize, expected_int8):
+        model_name = MODEL_NAMES[model_arch]
         task = model_cls.export_feature
         dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task]
         file_name = "openvino_quantized_model.xml"
@@ -128,7 +128,8 @@ def preprocess_function(examples, tokenizer):
         self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
-    def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8):
+    def test_ovmodel_static_quantization(self, model_cls, model_arch, expected_fake_quantize, expected_int8):
+        model_name = MODEL_NAMES[model_arch]
         task = model_cls.export_feature
         dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task]
         if "gpt2" in model_name:
@@ -188,13 +189,13 @@ class OVWeightCompressionTest(unittest.TestCase):
     LOAD_IN_4_BITS_SCOPE = (
         (
             OVModelForCausalLM,
-            "hf-internal-testing/tiny-random-gpt2",
+            "gpt2",
             dict(bits=4, sym=False, group_size=-1, ratio=0.8),
             14,
         ),
         (
             OVModelForCausalLM,
-            "hf-internal-testing/tiny-random-gpt2",
+            "gpt2",
             dict(
                 bits=4,
                 sym=False,
@@ -205,13 +206,13 @@ class OVWeightCompressionTest(unittest.TestCase):
         ),
         (
             OVModelForCausalLM,
-            "hf-internal-testing/tiny-random-gpt2",
+            "gpt2",
             dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True),
             18,
         ),
         (
             OVModelForCausalLM,
-            "hf-internal-testing/tiny-random-OPTForCausalLM",
+            "opt",
             dict(
                 bits=4,
                 sym=True,
@@ -224,7 +225,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         ),
         (
             OVModelForCausalLM,
-            "hf-internal-testing/tiny-random-OPTForCausalLM",
+            "opt",
             dict(
                 bits=4,
                 sym=True,
@@ -451,8 +452,9 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
 
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
-        self, model_cls, model_id, quantization_config, expected_ov_int4
+        self, model_cls, model_arch, quantization_config, expected_ov_int4
     ):
+        model_id = MODEL_NAMES[model_arch]
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)

From 868848a7ce47a1c3441601bb634f52da25002d46 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 14:51:13 +0200
Subject: [PATCH 04/26] remove calibration dataset argument

---
 optimum/intel/openvino/modeling_base.py    |  5 +-
 optimum/intel/openvino/modeling_decoder.py |  6 --
 tests/openvino/test_quantization.py        | 85 ++++++----------------
 3 files changed, 25 insertions(+), 71 deletions(-)

diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index d5b19bb28c..4afa3c58aa 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -103,7 +103,6 @@ def __init__(
     def load_model(
         file_name: Union[str, Path],
         quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
-        calibration_dataset: Optional = None,
     ):
         """
         Loads the model.
@@ -113,8 +112,6 @@ def __init__(
                 The path of the model ONNX or XML file.
             quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
                 Quantization config to apply after model is loaded.
-            calibration_dataset (`nncf.Dataset`, *optional*):
-                Optional nncf.Dataset to feed to model weight compression when quantization config is provided.
         """
 
         def fix_op_names_duplicates(model: openvino.runtime.Model):
@@ -143,7 +140,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):
 
             from optimum.intel.openvino.quantization import _weight_only_quantization
 
-            model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset)
+            model = _weight_only_quantization(model, quantization_config)
 
         return model

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 122972fad4..3a7b9004e9 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -596,16 +596,10 @@ def _from_pretrained(
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         load_in_4bit = quantization_config.bits == 4 if quantization_config else False
-        calibration_dataset = kwargs.get("calibration_dataset", None)
-
-        if calibration_dataset is not None:
-            logger.warning(
-                "The `calibration_dataset` argument is deprecated and will be removed. To apply quantization with a custom dataset, please use the `OVQuantizer`."
-            )
 
         model = cls.load_model(
             model_cache_path,
             quantization_config=None if load_in_4bit else quantization_config,
-            calibration_dataset=calibration_dataset,
         )
 
         model_type = config.model_type.replace("_", "-")

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 724f5550a2..9e9280f376 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -82,8 +82,8 @@ class OVQuantizerTest(unittest.TestCase):
     )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
-    def test_automodel_static_quantization(self, model_cls, model_arch, expected_fake_quantize, expected_int8):
-        model_name = MODEL_NAMES[model_arch]
+    def test_automodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8):
+        model_id = MODEL_NAMES[model_name]
         task = model_cls.export_feature
         dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task]
         file_name = "openvino_quantized_model.xml"
@@ -92,8 +92,8 @@ def preprocess_function(examples, tokenizer):
             return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True)
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            transformers_model = model_cls.auto_model_class.from_pretrained(model_name)
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            transformers_model = model_cls.auto_model_class.from_pretrained(model_id)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
             quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
@@ -128,19 +128,19 @@ def preprocess_function(examples, tokenizer):
         self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
-    def test_ovmodel_static_quantization(self, model_cls, model_arch, expected_fake_quantize, expected_int8):
-        model_name = MODEL_NAMES[model_arch]
+    def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8):
+        model_id = MODEL_NAMES[model_name]
         task = model_cls.export_feature
         dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task]
-        if "gpt2" in model_name:
+        if "gpt2" in model_id:
             expected_int8 -= 1
 
         def preprocess_function(examples, tokenizer):
             return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True)
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            transformers_model = model_cls.from_pretrained(model_name, export=True)
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            transformers_model = model_cls.from_pretrained(model_id, export=True)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
             quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
@@ -173,17 +173,15 @@ def preprocess_function(examples, tokenizer):
 class OVWeightCompressionTest(unittest.TestCase):
     # TODO : add models
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = (
-        (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 70, 70),
-        (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 44),
+        (OVModelForSequenceClassification, "bert", 70, 70),
+        (OVModelForCausalLM, "gpt2", 44, 44),
     )
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 86),)
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 148),)
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = (
-        (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 14, 50),
-    )
+
     SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = (
-        (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 44),
+        (OVModelForCausalLM, "gpt2", 44, 44),
     )
@@ -266,10 +264,11 @@ class OVWeightCompressionTest(unittest.TestCase):
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
         task = model_cls.export_feature
+        model_id = MODEL_NAMES[model_name]
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            transformers_model = model_cls.auto_model_class.from_pretrained(model_name)
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            transformers_model = model_cls.auto_model_class.from_pretrained(model_id)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
@@ -296,10 +295,11 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
         task = model_cls.export_feature
+        model_id = MODEL_NAMES[model_name]
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            transformers_model = model_cls.from_pretrained(model_name, export=True)
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            transformers_model = model_cls.from_pretrained(model_id, export=True)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
@@ -321,9 +321,8 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS)
     def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4):
         task = model_cls.export_feature
-
+        model_id = MODEL_NAMES[model_name]
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model_id = MODEL_NAMES[model_name]
             transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=False)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
@@ -351,9 +350,9 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
     @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above")
-    def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, expected_pt_int8, expected_ov_int8):
+    def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
         task = model_cls.export_feature
-
+        model_id = MODEL_NAMES[model_name]
         with tempfile.TemporaryDirectory() as tmp_dir:
             transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=True)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -452,9 +451,9 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
-        self, model_cls, model_arch, quantization_config, expected_ov_int4
+        self, model_cls, model_name, quantization_config, expected_ov_int4
     ):
-        model_id = MODEL_NAMES[model_arch]
+        model_id = MODEL_NAMES[model_name]
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
@@ -470,42 +469,6 @@ def test_ovmodel_4bit_auto_compression_with_config(
         self.assertEqual(openvino_config.quantization_config.bits, 4)
         self.assertEqual(openvino_config.dtype, "int4")
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS)
-    def test_ovmodel_4bit_auto_compression_with_custom_dataset(
-        self, model_cls, model_id, expected_int8, expected_int4
-    ):
-        task = model_cls.export_feature
-
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-
-        dataset_name, dataset_config_name, column = _TASK_TO_DATASET[task]
-        dataset = load_dataset(dataset_name, dataset_config_name, split="test")
-
-        def transform_fn(data, tokenizer):
-            tokenized_text = tokenizer(data[column], return_tensors="np")
-            input_ids = tokenized_text["input_ids"]
-            attention_mask = tokenized_text["attention_mask"]
-            inputs = {}
-            inputs["input_ids"] = input_ids
-            inputs["attention_mask"] = attention_mask
-            batch_size = input_ids.shape[0]
-            inputs["beam_idx"] = np.arange(batch_size, dtype=int)
-            return inputs
-
-        quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer))
-        model = model_cls.from_pretrained(
-            model_id,
-            export=True,
-            quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
-            calibration_dataset=quantization_dataset,
-        )
-
-        _, num_int8, num_int4 = get_num_quantized_nodes(model)
-        self.assertEqual(expected_int8, num_int8)
-        self.assertEqual(expected_int4, num_int4)
-
     @parameterized.expand(((OVModelForCausalLM, "gpt2"),))
     @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above")
     def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_type):
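Note: with the `calibration_dataset` keyword removed from `from_pretrained`, custom-dataset compression is expected to go through `OVQuantizer`, as the warning added in PATCH 01 says. The sketch below is an assumption-laden rewrite of the deleted test, not part of the series: the dataset name and text column mirror what `_TASK_TO_DATASET` maps text generation to, and the `OVQuantizer.quantize` keyword names are taken from the quantizer's existing API.

    from functools import partial

    import nncf
    import numpy as np
    from datasets import load_dataset
    from transformers import AutoTokenizer

    from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # dataset and column choice are assumptions mirroring the deleted test
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    def transform_fn(data, tokenizer):
        # builds the same inputs the deleted test assembled by hand, including beam_idx
        tokenized = tokenizer(data["text"], return_tensors="np")
        batch_size = tokenized["input_ids"].shape[0]
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "beam_idx": np.arange(batch_size, dtype=int),
        }

    model = OVModelForCausalLM.from_pretrained(model_id, export=True)
    quantizer = OVQuantizer.from_pretrained(model)
    quantizer.quantize(
        ov_config=OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8)),
        calibration_dataset=nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer)),
        save_directory="ov_4bit_model",
    )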
From f8e513c054ccd05e239eeca3cbd8e7e6101781bd Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 14:52:17 +0200
Subject: [PATCH 05/26] format

---
 optimum/intel/openvino/modeling_decoder.py | 2 +-
 tests/openvino/test_quantization.py        | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 3a7b9004e9..e109468079 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -636,7 +636,7 @@ def _from_pretrained(
                 f"For the given model, we recommend the following `quantization_config` : {default_config}"
             )
 
-        if calibration_dataset is None and isinstance(quantization_config.dataset, str):
+        if isinstance(quantization_config.dataset, str):
             tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)
 
             from optimum.gptq.data import get_dataset, prepare_dataset

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 9e9280f376..3ebe6f8db6 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -180,9 +180,7 @@ class OVWeightCompressionTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 86),)
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 148),)
-    SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = (
-        (OVModelForCausalLM, "gpt2", 44, 44),
-    )
+    SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),)

From 946360788793141910cedc4a585e1afbb3c8cbe1 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 14:58:15 +0200
Subject: [PATCH 06/26] remove comments

---
 optimum/intel/openvino/modeling_decoder.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index e109468079..91bf614e87 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -641,13 +641,9 @@ def _from_pretrained(
 
             from optimum.gptq.data import get_dataset, prepare_dataset
 
-            # from optimum.gptq.utils import get_seqlen
-
-            # seqlen = get_seqlen(causal_model)
-            nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
+            nsamples = quantization_config.num_samples or 128
             dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
             dataset = prepare_dataset(dataset)
-            quantization_config = copy.deepcopy(quantization_config)
             calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
 
             _weight_only_quantization(model, quantization_config, calibration_dataset)

From 4fde5ad308ff616c62aee620c6390f63b0878c3b Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 15:23:53 +0200
Subject: [PATCH 07/26] minor

---
 optimum/intel/openvino/modeling_decoder.py | 1 -
 tests/openvino/test_quantization.py        | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 91bf614e87..ea06063c96 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import logging
 import os
 from pathlib import Path

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 3ebe6f8db6..07101d2a5e 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -78,7 +78,7 @@ class OVQuantizerTest(unittest.TestCase):
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
         (OVModelForSequenceClassification, "bert", 32, 35),
-        (OVModelForCausalLM, "gpt2", 41, 23),
+        # (OVModelForCausalLM, "gpt2", 41, 23),
     )

From f138a4dfce2ba06469411cb3a25c0c4d06f99dcb Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 15:25:14 +0200
Subject: [PATCH 08/26] fix ignore

---
 .github/workflows/test_openvino.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml
index bff5cb525f..7eafb266e1 100644
--- a/.github/workflows/test_openvino.yml
+++ b/.github/workflows/test_openvino.yml
@@ -35,7 +35,7 @@ jobs:
         pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
     - name: Test with Pytest
       run: |
-        pytest tests/openvino/ --ignore test_modeling_basic --durations=0
+        pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
    - name: Test openvino-nightly
      run: |
        pip uninstall -y openvino

From 41c49a6755e1895494a6cf1213a56e2631171741 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 16:20:37 +0200
Subject: [PATCH 09/26] fix

---
 optimum/intel/openvino/modeling_decoder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index ea06063c96..39a7bee9a2 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -635,6 +635,7 @@ def _from_pretrained(
                 f"For the given model, we recommend the following `quantization_config` : {default_config}"
             )
 
+        calibration_dataset = None
         if isinstance(quantization_config.dataset, str):
             tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)
From 15f11049b6f699c4b838df0180757c57c8a59f90 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 17:49:10 +0200
Subject: [PATCH 10/26] replace preset with sym for compatibility between
 configs

---
 optimum/intel/openvino/configuration.py | 36 ++++++++++++++-----------
 tests/openvino/test_quantization.py     | 11 ++++----
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index e75301729d..f9888449c9 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -52,6 +52,7 @@
 }
 
+
 @dataclass
 class OVQuantizationConfigBase(QuantizationConfigMixin):
     """
@@ -309,9 +310,10 @@ class OVQuantizationConfig(OVQuantizationConfigBase):
     def __init__(
         self,
+        bits: int = 8,
+        sym: bool = False,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = 300,
-        preset: nncf.QuantizationPreset = None,
         model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER,
         fast_bias_correction: bool = True,
         overflow_fix: OverflowFix = OverflowFix.DISABLE,
@@ -323,18 +325,15 @@ def __init__(
         compression, during quantization both weights and activations are converted to lower precision. For weight-only
         model quantization please see OVWeightQuantizationConfig.
         Args:
+            bits (`int`, defaults to 8):
+                The number of bits to quantize to.
             ignored_scope (`dict`, *optional*):
                 An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary
                 entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
             num_samples (`int`, *optional*):
                 The maximum number of samples composing the calibration dataset.
-            preset (`nncf.QuantizationPreset`, *optional*):
-                A preset controls the quantization mode (symmetric and asymmetric).
-                It can take the following values:
-                - `performance`: Symmetric quantization of weights and activations.
-                - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
-                Default value is None. In this case, `mixed` preset is used for `transformer`
-                model type otherwise `performance`.
+            sym (`bool`, defaults to `False`):
+                Whether to use symmetric quantization on the activations. Symmetric quantization will be applied to the weights in any case.
             model_type (`nncf.ModelType`, defaults to nncf.ModelType.TRANSFORMER):
                 Model type is needed to specify additional patterns in the model. Supported only `transformer` now.
             fast_bias_correction (`bool`, defaults to True):
@@ -354,10 +353,9 @@ def __init__(
         # TODO: remove checks below once NNCF is updated to 2.10
         if isinstance(overflow_fix, str):
             overflow_fix = OverflowFix(overflow_fix)
-        if isinstance(preset, str):
-            preset = nncf.QuantizationPreset(preset)
 
-        self.preset = preset
+        self.bits = bits
+        self.sym = sym
         self.model_type = model_type
         self.fast_bias_correction = fast_bias_correction
         self.overflow_fix = overflow_fix
@@ -365,7 +363,7 @@ def __init__(
 
     def to_dict(self) -> Dict[str, Any]:
         # TODO: remove code below once NNCF is updated to 2.10
-        if isinstance(self.overflow_fix, Enum) or isinstance(self.preset, Enum):
+        if isinstance(self.overflow_fix, Enum):
             overflow_fix_value = (
                 None
                 if self.overflow_fix is None
@@ -373,15 +371,21 @@ def to_dict(self) -> Dict[str, Any]:
                 if isinstance(self.overflow_fix, str)
                 else self.overflow_fix.value
             )
-            preset_value = (
-                None if self.preset is None else self.preset if isinstance(self.preset, str) else self.preset.value
-            )
             self_copy = copy.deepcopy(self)
             self_copy.overflow_fix = overflow_fix_value
-            self_copy.preset = preset_value
             return self_copy.to_dict()
         return super().to_dict()
 
+    def post_init(self):
+        r"""
+        Safety checker that arguments are correct
+        """
+        super().post_init()
+
+        if self.bits != 8:
+            raise ValueError(f"Only 8-bit quantization is supported for static quantization, but {self.bits} bits were given")
+
 
 def _check_default_4bit_configs(config: PretrainedConfig):
     return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 07101d2a5e..b198bfde06 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -114,10 +114,9 @@ def preprocess_function(examples, tokenizer):
             )
             model = model_cls.from_pretrained(tmp_dir, file_name=file_name)
 
-            # TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm)
-            # num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model)
-            # self.assertEqual(expected_fake_quantize, num_fake_quantize)
-            # self.assertEqual(expected_int8, num_int8)
+            num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model)
+            self.assertEqual(expected_fake_quantize, num_fake_quantize)
+            self.assertEqual(expected_int8, num_int8)
 
             tokens = tokenizer("This is a sample input", return_tensors="pt")
             outputs = model(**tokens)
@@ -711,7 +710,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
             OVQuantizationConfig(
                 ignored_scope={"names": ["op_name"]},
                 num_samples=100,
-                preset=nncf.QuantizationPreset.MIXED,
+                sym=False,
                 model_type=nncf.ModelType.TRANSFORMER,
                 fast_bias_correction=True,
                 overflow_fix=OverflowFix.DISABLE,
@@ -757,7 +756,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
         dict(
             ignored_scope={"names": ["op_name"]},
             num_samples=100,
-            preset=nncf.QuantizationPreset.MIXED,
+            sym=False,
             model_type=nncf.ModelType.TRANSFORMER,
             fast_bias_correction=True,
             overflow_fix=OverflowFix.DISABLE,

From 675665ecf866241511aafc917b39d8b2a466b644 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 15 Apr 2024 17:49:25 +0200
Subject: [PATCH 11/26] format

---
 optimum/intel/openvino/configuration.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index f9888449c9..9aa223a0f0 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -52,7 +52,6 @@
 }
 
-
 @dataclass
 class OVQuantizationConfigBase(QuantizationConfigMixin):
     """
@@ -376,16 +375,15 @@ def to_dict(self) -> Dict[str, Any]:
             return self_copy.to_dict()
         return super().to_dict()
 
-
     def post_init(self):
         r"""
         Safety checker that arguments are correct
         """
         super().post_init()
-
+
         if self.bits != 8:
             raise ValueError(f"Only 8-bit quantization is supported for static quantization, but {self.bits} bits were given")
-
+
 
 def _check_default_4bit_configs(config: PretrainedConfig):
     return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
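Note for readers: the new `sym` flag subsumes the old NNCF preset choice. The mapping below is lifted from PATCH 15 further down, where it is applied inside the `nncf.quantize` calls; the helper function name here is hypothetical, the mapping itself is the one the series ships.

    import nncf

    def _preset_from_sym(sym: bool) -> nncf.QuantizationPreset:
        # sym=True  -> symmetric weights and activations (the old `performance` preset)
        # sym=False -> symmetric weights, asymmetric activations (the old `mixed` preset)
        return nncf.QuantizationPreset.PERFORMANCE if sym else nncf.QuantizationPreset.MIXED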
From 486e6d77a2f4a30e266119d0b56a8bae274804b3 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Tue, 16 Apr 2024 13:56:06 +0200
Subject: [PATCH 12/26] add dynamic quantization

---
 optimum/intel/openvino/modeling_base.py         | 16 +++++++++++-----
 optimum/intel/openvino/modeling_base_seq2seq.py |  7 ++-----
 optimum/intel/openvino/modeling_diffusion.py    |  5 ++---
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 4afa3c58aa..e317cc0fbe 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -31,7 +31,7 @@
 from ...exporters.openvino import export, main_export
 from ..utils.import_utils import is_nncf_available
-from .configuration import OVConfig, OVWeightQuantizationConfig
+from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig
 from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties
 
@@ -64,10 +64,7 @@ def __init__(
         self.model_save_dir = model_save_dir
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
-        self.ov_config = ov_config if ov_config is not None else {}
-        if self.ov_config.get("PERFORMANCE_HINT") is None:
-            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
-
+        self.ov_config = {} if ov_config is None else {**ov_config}
         self.preprocessors = kwargs.get("preprocessors", [])
         enable_compilation = kwargs.get("compile", True)
 
@@ -98,6 +95,7 @@ def __init__(
         self._openvino_config = None
         if quantization_config:
             self._openvino_config = OVConfig(quantization_config=quantization_config)
+        self._set_ov_config_parameters()
 
     @staticmethod
     def load_model(
@@ -248,6 +246,14 @@ def _prepare_weight_quantization_config(
 
         return quantization_config
 
+    def _set_ov_config_parameters(self):
+        if self.ov_config.get("PERFORMANCE_HINT") is None:
+            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
+
+        q_config = self._openvino_config.quantization_config if self._openvino_config else None
+        if isinstance(q_config, OVDynamicQuantizationConfig):
+            self.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = str(q_config.activations_group_size)
+
     @staticmethod
     def _cached_file(
         model_path: Union[Path, str],

diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index 0daf9dfdd3..78648e93d2 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -66,11 +66,7 @@ def __init__(
         self.model_save_dir = model_save_dir
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
-        self.ov_config = ov_config if ov_config is not None else {}
-
-        if self.ov_config.get("PERFORMANCE_HINT") is None:
-            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
-
+        self.ov_config = {} if ov_config is None else {**ov_config}
         self.preprocessors = kwargs.get("preprocessors", [])
 
         if self.is_dynamic:
@@ -84,6 +80,7 @@ def __init__(
         self._openvino_config = None
         if quantization_config:
             self._openvino_config = OVConfig(quantization_config=quantization_config)
+        self._set_ov_config_parameters()
 
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index eb407b4cd1..a64a013484 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -100,9 +100,7 @@ def __init__(
         self._internal_dict = config
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
-        self.ov_config = ov_config if ov_config is not None else {}
-        if self.ov_config.get("PERFORMANCE_HINT") is None:
-            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
+        self.ov_config = {} if ov_config is None else {**ov_config}
 
         # This attribute is needed to keep one reference on the temporary directory, since garbage collecting
         # would end-up removing the directory containing the underlying OpenVINO model
@@ -162,6 +160,7 @@ def __init__(
         self._openvino_config = None
         if quantization_config:
             self._openvino_config = OVConfig(quantization_config=quantization_config)
+        self._set_ov_config_parameters()
 
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """

From ce66da36dee1c0095e565ba37f17a1ac0f2945d0 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Tue, 16 Apr 2024 14:04:58 +0200
Subject: [PATCH 13/26] add dynamic config

---
 optimum/intel/openvino/configuration.py | 47 +++++++++++--------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 9aa223a0f0..41198f3035 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -62,7 +62,6 @@ def __init__(
         self,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
-        weight_only: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -72,14 +71,11 @@ def __init__(
                 entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
             num_samples (`int`, *optional*):
                 The maximum number of samples composing the calibration dataset.
-            weight_only (`bool`, *optional*):
-                Used to explicitly specify type of quantization (weight-only of full) to apply.
         """
         if isinstance(ignored_scope, nncf.IgnoredScope):
             ignored_scope = ignored_scope.__dict__
         self.ignored_scope = ignored_scope
         self.num_samples = num_samples
-        self.weight_only = weight_only
 
     def post_init(self):
         try:
@@ -191,6 +187,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
     Args:
         bits (`int`, defaults to 8):
             The number of bits to quantize to.
+        group_size (`int`, *optional*):
+            The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         sym (`bool`, defaults to `False`):
             Whether to use symmetric quantization.
         tokenizer (`str`, *optional*):
@@ -209,8 +207,6 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
-        group_size (`int`, *optional*):
-            The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`str`, *optional*):
@@ -223,33 +219,24 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             The maximum number of samples composing the calibration dataset.
         quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT):
             Weight compression method to apply.
-        weight_only (`bool`, *optional*):
-            Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building
-            the config from dictionary.
     """
 
     def __init__(
         self,
         bits: int = 8,
+        group_size: Optional[int] = None,
         sym: bool = False,
         tokenizer: Optional[str] = None,
         dataset: Optional[Union[str, List[str]]] = None,
        ratio: float = 1.0,
-        group_size: Optional[int] = None,
         all_layers: Optional[bool] = None,
         sensitivity_metric: Optional[str] = None,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
         quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT,
-        weight_only: Optional[bool] = True,
         **kwargs,
     ):
-        if weight_only is False:
-            logger.warning(
-                "Trying to create an instance of `OVWeightQuantizationConfig` with `weight_only` being "
-                "False. Please check your configuration."
-            )
-        super().__init__(ignored_scope, num_samples, True)
+        super().__init__(ignored_scope, num_samples)
         self.bits = bits
         self.sym = sym
         self.tokenizer = tokenizer
@@ -305,6 +292,21 @@ def post_init(self):
             raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")
 
 
+@dataclass
+class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
+
+    def __init__(
+        self,
+        bits: int = 8,
+        weights_group_size: Optional[int] = None,
+        activations_group_size: int = 32,
+        **kwargs,
+    ):
+        super().__init__(bits=bits, group_size=weights_group_size, **kwargs)
+        # TODO add kv_cache_dtype
+        self.activations_group_size = activations_group_size
+
+
 @dataclass
 class OVQuantizationConfig(OVQuantizationConfigBase):
     def __init__(
@@ -316,7 +318,6 @@ def __init__(
         model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER,
         fast_bias_correction: bool = True,
         overflow_fix: OverflowFix = OverflowFix.DISABLE,
-        weight_only: Optional[bool] = False,
         **kwargs,
     ):
         """
@@ -339,16 +340,8 @@ def __init__(
             Whether to apply fast or full bias correction algorithm.
         overflow_fix (`nncf.OverflowFix`, default to OverflowFix.DISABLE):
             Parameter for controlling overflow fix setting.
-        weight_only (`bool`, *optional*):
-            Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building
-            the config from dictionary.
         """
-        if weight_only is True:
-            logger.warning(
-                "Trying to create an instance of `OVQuantizationConfig` with `weight_only` being True. "
-                "Please check your configuration."
-            )
-        super().__init__(ignored_scope, num_samples, False)
+        super().__init__(ignored_scope, num_samples)
         # TODO: remove checks below once NNCF is updated to 2.10
         if isinstance(overflow_fix, str):
             overflow_fix = OverflowFix(overflow_fix)
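Note: a minimal usage sketch for the new dynamic quantization path added in PATCH 12/13, not part of the series. The model id is a placeholder; the assertion reflects `_set_ov_config_parameters` from PATCH 12, which propagates `activations_group_size` to the runtime configuration.

    from optimum.intel import OVDynamicQuantizationConfig, OVModelForCausalLM

    quantization_config = OVDynamicQuantizationConfig(bits=8, activations_group_size=32)
    model = OVModelForCausalLM.from_pretrained("gpt2", export=True, quantization_config=quantization_config)
    # the dynamic group size ends up in the OpenVINO runtime config
    assert model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] == "32"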
From d86de3a5555add81f360d0d1e8e8df2a1c11b8e0 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Tue, 16 Apr 2024 14:23:38 +0200
Subject: [PATCH 14/26] remove test deprecated config parameter

---
 optimum/intel/openvino/configuration.py | 1 -
 tests/openvino/test_quantization.py     | 8 --------
 2 files changed, 9 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 41198f3035..9aab666823 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -294,7 +294,6 @@ def post_init(self):
 
 @dataclass
 class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
-
     def __init__(
         self,
         bits: int = 8,

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index b198bfde06..33d7c232f3 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -770,12 +770,6 @@ class OVQuantizationConfigTest(unittest.TestCase):
         (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None),
         (dict(bits=8, fast_bias_correction=True, weight_only=True), OVWeightQuantizationConfig, None),
         (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None),
-        (dict(bits=8, sym=True, weight_only=False), OVWeightQuantizationConfig, "Please check your configuration"),
-        (
-            dict(model_type=nncf.ModelType.TRANSFORMER, weight_only=True),
-            OVQuantizationConfig,
-            "Please check your configuration",
-        ),
     )
@@ -818,8 +812,6 @@ def test_config_from_dict(self, quantization_config: dict, config_type: type, wa
         ov_config = OVConfig(quantization_config=quantization_config)
         self.assertIsInstance(ov_config.quantization_config, config_type)
         for k, v in quantization_config.items():
-            if k == "weight_only" and warning_log == "Please check your configuration":
-                continue
             if hasattr(ov_config.quantization_config, k):
                 self.assertEqual(getattr(ov_config.quantization_config, k), v)

From 1e46ac97a3cbffe2f0af8ae845d68d3d9793ccd0 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Tue, 16 Apr 2024 16:00:38 +0200
Subject: [PATCH 15/26] fix

---
 optimum/intel/openvino/quantization.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index d80a4ca6fc..37020e2dd8 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -371,7 +371,7 @@ def _quantize_ovbasemodel(
             subset_size=quantization_config.num_samples,
             ignored_scope=quantization_config.get_ignored_scope_instance(),
             model_type=quantization_config.model_type,
-            preset=quantization_config.preset,
+            preset=nncf.QuantizationPreset.PERFORMANCE if quantization_config.sym else nncf.QuantizationPreset.MIXED,
             fast_bias_correction=quantization_config.fast_bias_correction,
             advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix),
             **kwargs,
@@ -469,7 +469,9 @@ def _quantize_torchmodel(
             subset_size=quantization_config.num_samples,
             ignored_scope=quantization_config.get_ignored_scope_instance(),
             model_type=quantization_config.model_type,
-            preset=quantization_config.preset,
+            preset=nncf.QuantizationPreset.PERFORMANCE
+            if quantization_config.sym
+            else nncf.QuantizationPreset.MIXED,
             fast_bias_correction=quantization_config.fast_bias_correction,
             advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix),
             **kwargs,

From 69c955bd992f616ec4f2092a4e219d95f44647e0 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 14:56:31 +0200
Subject: [PATCH 16/26] add bits and sym to base config

---
 optimum/intel/openvino/configuration.py | 193 ++++++++++++------------
 1 file changed, 100 insertions(+), 93 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 16ccd3c057..ff44d30bac 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -69,22 +69,31 @@ class OVQuantizationConfigBase(QuantizationConfigMixin):
     def __init__(
         self,
+        bits: int = 8,
+        sym: bool = False,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
         **kwargs,
     ):
         """
         Args:
+            bits (`int`, defaults to 8):
+                The number of bits to quantize to.
+            sym (`bool`, defaults to `False`):
+                Whether to use symmetric quantization.
             ignored_scope (`dict`, *optional*):
                 An ignored scope that defines a list of model nodes to be ignored during quantization. Dictionary
                 entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
             num_samples (`int`, *optional*):
                 The maximum number of samples composing the calibration dataset.
         """
+        self.bits = bits
+        self.sym = sym
+        self.num_samples = num_samples
+
         if isinstance(ignored_scope, nncf.IgnoredScope):
             ignored_scope = ignored_scope.__dict__
         self.ignored_scope = ignored_scope
-        self.num_samples = num_samples
 
     def post_init(self):
         try:
@@ -102,87 +111,6 @@ def get_ignored_scope_instance(self) -> "nncf.IgnoredScope":
         return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope))
 
-class OVConfig(BaseConfig):
-    CONFIG_NAME = "openvino_config.json"
-    FULL_CONFIGURATION_FILE = "openvino_config.json"
-
-    def __init__(
-        self,
-        input_info: Optional[List] = None,
-        save_onnx_model: bool = False,
-        quantization_config: Optional[Union[dict, OVQuantizationConfigBase]] = None,
-        dtype: Optional[str] = None,
-        **kwargs,
-    ):
-        super().__init__()
-        self.input_info = input_info
-        self.save_onnx_model = save_onnx_model
-        self.optimum_version = kwargs.pop("optimum_version", None)
-        if isinstance(quantization_config, dict):
-            quantization_config = self._quantization_config_from_dict(quantization_config)
-        self.quantization_config = quantization_config
-        self.compression = None  # A field for backward-compatibility of training-time compression parameters
-
-        bits = (
-            self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None
-        )
-        self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype
-
-    def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False):
-        self.input_info = [
-            {
-                "sample_size": [1] + list(value.shape[1:]) if force_batch_one else list(value.shape),
-                "type": "long" if value.dtype is torch.int64 else "float",
-                "keyword": name,
-            }
-            for name, value in model_inputs.items()
-        ]
-
-    @staticmethod
-    def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase:
-        wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args
-        q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args
-        config_keys = quantization_config.keys()
-        matches_wq_config_signature = all(arg_name in wq_args for arg_name in config_keys)
-        matches_q_config_signature = all(arg_name in q_args for arg_name in config_keys)
-        if matches_wq_config_signature == matches_q_config_signature:
-            weight_only = quantization_config.get("weight_only", None)
-            if weight_only is None:
-                logger.warning(
-                    "Can't determine type of OV quantization config. Please specify explicitly whether you intend to "
-                    "run weight-only quantization or not with `weight_only` parameter. Creating an instance of "
-                    "OVWeightQuantizationConfig."
-                )
-                return OVWeightQuantizationConfig.from_dict(quantization_config)
-            matches_wq_config_signature = weight_only
-
-        config_type = OVWeightQuantizationConfig if matches_wq_config_signature else OVQuantizationConfig
-        return config_type.from_dict(quantization_config)
-
-    def _to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]:
-        class ConfigStub:
-            def to_dict(self):
-                return None
-
-            def to_diff_dict(self):
-                return None
-
-        if self.quantization_config is None:
-            # Parent to_dict() implementation does not support quantization_config being None
-            self_copy = copy.deepcopy(self)
-            self_copy.quantization_config = ConfigStub()
-            result = self_copy.to_diff_dict() if to_diff_dict else self_copy.to_dict()
-        else:
-            result = super().to_diff_dict() if to_diff_dict else super().to_dict()
-        return result
-
-    def to_dict(self) -> Dict[str, Any]:
-        return self._to_dict_safe(to_diff_dict=False)
-
-    def to_diff_dict(self) -> Dict[str, Any]:
-        return self._to_dict_safe(to_diff_dict=True)
-
-
 @dataclass
 class OVWeightQuantizationConfig(OVQuantizationConfigBase):
     """
@@ -192,10 +120,10 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
     Args:
         bits (`int`, defaults to 8):
             The number of bits to quantize to.
+        sym (`bool`, defaults to `False`):
+            Whether to use symmetric quantization on the weights.
         group_size (`int`, *optional*):
             The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
-        sym (`bool`, defaults to `False`):
-            Whether to use symmetric quantization.
         tokenizer (`str`, *optional*):
             The tokenizer used to process the dataset. You can pass either:
             - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
@@ -229,8 +157,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
     def __init__(
         self,
         bits: int = 8,
-        group_size: Optional[int] = None,
         sym: bool = False,
+        group_size: Optional[int] = None,
         tokenizer: Optional[str] = None,
         dataset: Optional[Union[str, List[str]]] = None,
         ratio: float = 1.0,
@@ -241,9 +169,7 @@ def __init__(
         quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
         **kwargs,
     ):
-        super().__init__(ignored_scope, num_samples)
-        self.bits = bits
-        self.sym = sym
+        super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
         self.tokenizer = tokenizer
         self.dataset = dataset
         self.group_size = group_size or (-1 if bits == 8 else 128)
@@ -302,11 +228,12 @@ class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
     def __init__(
         self,
         bits: int = 8,
+        sym: bool = False,
         weights_group_size: Optional[int] = None,
         activations_group_size: int = 32,
         **kwargs,
     ):
-        super().__init__(bits=bits, group_size=weights_group_size, **kwargs)
+        super().__init__(bits=bits, sym=sym, group_size=weights_group_size, **kwargs)
         # TODO add kv_cache_dtype
         self.activations_group_size = activations_group_size
@@ -329,6 +256,8 @@ def __init__(
         compression, during quantization both weights and activations are converted to lower precision. For weight-only
         model quantization please see OVWeightQuantizationConfig.
         Args:
+            bits (`int`, defaults to 8):
+                The number of bits to quantize to.
             sym (`bool`, defaults to `False`):
                 Whether to use symmetric quantization on the activations. Symmetric quantization will be applied to the weights in any case.
             ignored_scope (`dict`, *optional*):
@@ -343,15 +272,12 @@ def __init__(
             overflow_fix (`str`, default to "disable"):
                 Parameter for controlling overflow fix setting.
         """
-        super().__init__(ignored_scope, num_samples)
-        self.bits = bits
-        self.sym = sym
+        super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
         self.model_type = model_type
         self.fast_bias_correction = fast_bias_correction
         self.overflow_fix = overflow_fix
         self.post_init()
 
-
     def post_init(self):
         r"""
         Safety checker that arguments are correct
         """
         super().post_init()
 
         if self.bits != 8:
             raise ValueError(f"Only 8-bit quantization is supported for static quantization, but {self.bits} bits were given")
 
 
 def _check_default_4bit_configs(config: PretrainedConfig):
     return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
+
+
+class OVConfig(BaseConfig):
+    CONFIG_NAME = "openvino_config.json"
+    FULL_CONFIGURATION_FILE = "openvino_config.json"
+
+    def __init__(
+        self,
+        input_info: Optional[List] = None,
+        save_onnx_model: bool = False,
+        quantization_config: Optional[Union[dict, OVQuantizationConfigBase]] = None,
+        dtype: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.input_info = input_info
+        self.save_onnx_model = save_onnx_model
+        self.optimum_version = kwargs.pop("optimum_version", None)
+        if isinstance(quantization_config, dict):
+            quantization_config = self._quantization_config_from_dict(quantization_config)
+        self.quantization_config = quantization_config
+        self.compression = None  # A field for backward-compatibility of training-time compression parameters
+
+        bits = (
+            self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None
+        )
+        self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype
+
+    def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False):
+        self.input_info = [
+            {
+                "sample_size": [1] + list(value.shape[1:]) if force_batch_one else list(value.shape),
+                "type": "long" if value.dtype is torch.int64 else "float",
+                "keyword": name,
+            }
+            for name, value in model_inputs.items()
+        ]
+
+    @staticmethod
+    def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase:
+        wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args
+        q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args
+        config_keys = quantization_config.keys()
+        matches_wq_config_signature = all(arg_name in wq_args for arg_name in config_keys)
+        matches_q_config_signature = all(arg_name in q_args for arg_name in config_keys)
+        if matches_wq_config_signature == matches_q_config_signature:
+            weight_only = quantization_config.get("weight_only", None)
+            if weight_only is None:
+                logger.warning(
+                    "Can't determine type of OV quantization config. Please specify explicitly whether you intend to "
+                    "run weight-only quantization or not with `weight_only` parameter. Creating an instance of "
+                    "OVWeightQuantizationConfig."
+                )
+                return OVWeightQuantizationConfig.from_dict(quantization_config)
+            matches_wq_config_signature = weight_only
+
+        config_type = OVWeightQuantizationConfig if matches_wq_config_signature else OVQuantizationConfig
+        return config_type.from_dict(quantization_config)
+
+    def _to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]:
+        class ConfigStub:
+            def to_dict(self):
+                return None
+
+            def to_diff_dict(self):
+                return None
+
+        if self.quantization_config is None:
+            # Parent to_dict() implementation does not support quantization_config being None
+            self_copy = copy.deepcopy(self)
+            self_copy.quantization_config = ConfigStub()
+            result = self_copy.to_diff_dict() if to_diff_dict else self_copy.to_dict()
+        else:
+            result = super().to_diff_dict() if to_diff_dict else super().to_dict()
+        return result
+
+    def to_dict(self) -> Dict[str, Any]:
+        return self._to_dict_safe(to_diff_dict=False)
+
+    def to_diff_dict(self) -> Dict[str, Any]:
+        return self._to_dict_safe(to_diff_dict=True)
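Note: with `bits` and `sym` now on the shared base class, a plain dict of common keys no longer identifies one config type by signature alone, so the `weight_only` key (exercised by the tests in PATCH 19) disambiguates. A small sketch, not part of the series:

    from optimum.intel import OVConfig

    # keys common to both classes, so `weight_only` picks the resulting type
    weight_conf = OVConfig(quantization_config={"bits": 8, "sym": True, "weight_only": True})
    full_conf = OVConfig(quantization_config={"bits": 8, "sym": True, "weight_only": False})
    print(type(weight_conf.quantization_config).__name__)  # OVWeightQuantizationConfig
    print(type(full_conf.quantization_config).__name__)    # OVQuantizationConfig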
From eb1a8439428c9392dadef821119cb058f35e2ede Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 14:57:04 +0200
Subject: [PATCH 17/26] fix

---
 optimum/intel/openvino/configuration.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index ff44d30bac..a3e96ff4cc 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -312,10 +312,7 @@ def __init__(
             quantization_config = self._quantization_config_from_dict(quantization_config)
         self.quantization_config = quantization_config
         self.compression = None  # A field for backward-compatibility of training-time compression parameters
-
-        bits = (
-            self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None
-        )
+        bits = self.quantization_config.bits if self.quantization_config else None
         self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype
 
@@ -332,11 +329,11 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False):
     @staticmethod
     def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase:
         wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args
         q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args
+        weight_only = quantization_config.pop("weight_only", None)
         config_keys = quantization_config.keys()
         matches_wq_config_signature = all(arg_name in wq_args for arg_name in config_keys)
         matches_q_config_signature = all(arg_name in q_args for arg_name in config_keys)
         if matches_wq_config_signature == matches_q_config_signature:
-            weight_only = quantization_config.get("weight_only", None)
             if weight_only is None:
                 logger.warning(
                     "Can't determine type of OV quantization config. Please specify explicitly whether you intend to "

From 4471552ae8bbd916ea120ea0e62eaa6f1ff866ec Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 15:00:35 +0200
Subject: [PATCH 18/26] updated message

---
 optimum/intel/openvino/configuration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index a3e96ff4cc..1c950c3a23 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -191,7 +191,7 @@ def post_init(self):
         if not (self.dataset is None or isinstance(self.dataset, (str, list))):
             raise ValueError(
                 f"Dataset must be an instance of either string or list of strings, but found {type(self.dataset)}. "
-                f"If you wish to provide a custom dataset please pass it via `calibration_dataset` argument."
+                f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
             llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"]

From ec367429ff9f2b86ef9d43f4eb86beca259c2348 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 16:26:40 +0200
Subject: [PATCH 19/26] add config test

---
 optimum/intel/__init__.py           |  7 ++++---
 optimum/intel/openvino/__init__.py  |  2 +-
 tests/openvino/test_quantization.py | 17 +++++------------
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py
index 615e23801e..f983ba40ff 100644
--- a/optimum/intel/__init__.py
+++ b/optimum/intel/__init__.py
@@ -60,11 +60,11 @@
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     _import_structure["utils.dummy_openvino_and_nncf_objects"].extend(
-        ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig"]
+        ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVDynamicQuantizationConfig"]
     )
 else:
     _import_structure["openvino"].extend(
-        ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig"]
+        ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVDynamicQuantizationConfig"]
     )
 
@@ -195,9 +195,10 @@
             OVQuantizationConfig,
             OVQuantizer,
             OVTrainingArguments,
             OVWeightQuantizationConfig,
+            OVDynamicQuantizationConfig,
         )
     else:
-        from .openvino import OVQuantizationConfig, OVQuantizer, OVTrainingArguments, OVWeightQuantizationConfig
+        from .openvino import OVQuantizationConfig, OVQuantizer, OVTrainingArguments, OVWeightQuantizationConfig, OVDynamicQuantizationConfig
 
 try:
     if not (is_openvino_available() and is_nncf_available() and is_accelerate_available()):
diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py
index 0cd7d8a029..03c27c07b8 100644
--- a/optimum/intel/openvino/__init__.py
+++ b/optimum/intel/openvino/__init__.py
@@ -43,7 +43,7 @@
     from .trainer import OVTrainer
 
 
-from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig
+from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig, OVDynamicQuantizationConfig
 from .modeling import (
     OVModelForAudioClassification,
     OVModelForAudioFrameClassification,
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 443cdb38b6..1ce2e0ae01 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -62,6 +62,7 @@
     OVTrainer,
     OVQuantizationConfig,
     OVWeightQuantizationConfig,
+    OVDynamicQuantizationConfig,
 )
 from optimum.intel.openvino.configuration import OVQuantizationMethod, OVQuantizationConfigBase
 
@@ -684,12 +685,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
     QUANTIZATION_CONFIGS = (
         (None,),
         (OVWeightQuantizationConfig(),),
-        (
-            OVWeightQuantizationConfig(
-                bits=8,
-                sym=True,
-            ),
-        ),
+        (OVWeightQuantizationConfig(bits=8,sym=True),),
         (
             OVWeightQuantizationConfig(
                 dataset="wikitext",
@@ -717,6 +713,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
             ),
         ),
         (OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),),
+        (OVDynamicQuantizationConfig(bits=8,sym=True),),
     )
 
     QUANTIZATION_CONFIG_DICTS = (
@@ -746,11 +743,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
         ),
         (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
         (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
-        (
-            dict(bits=8, fast_bias_correction=True),
-            OVWeightQuantizationConfig,
-            "Can't determine type of OV quantization config",
-        ),
+        (dict(bits=8, fast_bias_correction=True, dataset="wikitext"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
         (dict(model_type="transformer"), OVQuantizationConfig, None),
         (
             dict(
@@ -768,7 +761,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
         (dict(weight_only=False), OVQuantizationConfig, None),
         (dict(abc="def", weight_only=False), OVQuantizationConfig, None),
         (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None),
-        (dict(bits=8, fast_bias_correction=True, weight_only=True), OVWeightQuantizationConfig, None),
+        (dict(bits=8, fast_bias_correction=True, dataset="wikitext", weight_only=True), OVWeightQuantizationConfig, None),
         (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None),
     )
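To make the new export concrete, a rough usage sketch for the config class added to the public API above (a hypothetical snippet, not part of the patches; it assumes the constructor surface the tests exercise):

    from optimum.intel import OVConfig, OVDynamicQuantizationConfig

    quantization_config = OVDynamicQuantizationConfig(bits=8, sym=True)
    ov_config = OVConfig(quantization_config=quantization_config)

    # `dtype` is derived from the nested config's bit width after the fix above.
    assert ov_config.dtype == "int8"

    # Serialization goes through _to_dict_safe, so an OVConfig with no
    # quantization config at all also serializes cleanly via the ConfigStub fallback.
    assert OVConfig().to_dict() is not None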
From 985513e30a2c1fe23cb99b72ce32c35b4809fec1 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 16:42:07 +0200
Subject: [PATCH 20/26] format

---
 optimum/intel/__init__.py           | 26 ++++++++++++++++++++++----
 optimum/intel/openvino/__init__.py  |  2 +-
 tests/openvino/test_quantization.py | 16 ++++++++++++----
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py
index f983ba40ff..590b4281c2 100644
--- a/optimum/intel/__init__.py
+++ b/optimum/intel/__init__.py
@@ -60,11 +60,23 @@
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     _import_structure["utils.dummy_openvino_and_nncf_objects"].extend(
-        ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVDynamicQuantizationConfig"]
+        [
+            "OVQuantizer",
+            "OVTrainingArguments",
+            "OVQuantizationConfig",
+            "OVWeightQuantizationConfig",
+            "OVDynamicQuantizationConfig",
+        ]
     )
 else:
     _import_structure["openvino"].extend(
-        ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVDynamicQuantizationConfig"]
+        [
+            "OVQuantizer",
+            "OVTrainingArguments",
+            "OVQuantizationConfig",
+            "OVWeightQuantizationConfig",
+            "OVDynamicQuantizationConfig",
+        ]
     )
 
@@ -191,14 +203,20 @@
     raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     from .utils.dummy_openvino_and_nncf_objects import (
+        OVDynamicQuantizationConfig,
         OVQuantizationConfig,
         OVQuantizer,
         OVTrainingArguments,
         OVWeightQuantizationConfig,
-        OVDynamicQuantizationConfig,
     )
 else:
-    from .openvino import OVQuantizationConfig, OVQuantizer, OVTrainingArguments, OVWeightQuantizationConfig, OVDynamicQuantizationConfig
+    from .openvino import (
+        OVDynamicQuantizationConfig,
+        OVQuantizationConfig,
+        OVQuantizer,
+        OVTrainingArguments,
+        OVWeightQuantizationConfig,
+    )
 
 try:
     if not (is_openvino_available() and is_nncf_available() and is_accelerate_available()):
diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py
index 03c27c07b8..68d3a15cfa 100644
--- a/optimum/intel/openvino/__init__.py
+++ b/optimum/intel/openvino/__init__.py
@@ -43,7 +43,7 @@
     from .trainer import OVTrainer
 
 
-from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig, OVDynamicQuantizationConfig
+from .configuration import OVConfig, OVDynamicQuantizationConfig, OVQuantizationConfig, OVWeightQuantizationConfig
 from .modeling import (
     OVModelForAudioClassification,
     OVModelForAudioFrameClassification,
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 1ce2e0ae01..39246300da 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -685,7 +685,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
     QUANTIZATION_CONFIGS = (
         (None,),
         (OVWeightQuantizationConfig(),),
-        (OVWeightQuantizationConfig(bits=8,sym=True),),
+        (OVWeightQuantizationConfig(bits=8, sym=True),),
         (
             OVWeightQuantizationConfig(
                 dataset="wikitext",
@@ -713,7 +713,7 @@ class OVQuantizationConfigTest(unittest.TestCase):
             ),
         ),
         (OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),),
-        (OVDynamicQuantizationConfig(bits=8,sym=True),),
+        (OVDynamicQuantizationConfig(bits=8, sym=True),),
     )
 
     QUANTIZATION_CONFIG_DICTS = (
@@ -743,7 +743,11 @@ class OVQuantizationConfigTest(unittest.TestCase):
         ),
         (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
         (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
-        (dict(bits=8, fast_bias_correction=True, dataset="wikitext"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
+        (
+            dict(bits=8, fast_bias_correction=True, dataset="wikitext"),
+            OVWeightQuantizationConfig,
+            "Can't determine type of OV quantization config",
+        ),
         (dict(model_type="transformer"), OVQuantizationConfig, None),
         (
             dict(
@@ -768,7 +765,11 @@ class OVQuantizationConfigTest(unittest.TestCase):
         (dict(weight_only=False), OVQuantizationConfig, None),
         (dict(abc="def", weight_only=False), OVQuantizationConfig, None),
         (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None),
-        (dict(bits=8, fast_bias_correction=True, dataset="wikitext", weight_only=True), OVWeightQuantizationConfig, None),
+        (
+            dict(bits=8, fast_bias_correction=True, dataset="wikitext", weight_only=True),
+            OVWeightQuantizationConfig,
+            None,
+        ),
         (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None),
     )

From 31b04604ea59198a9b7e9bf2212d24bd85c093bd Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 17:45:47 +0200
Subject: [PATCH 21/26] add kv cache precision

---
 optimum/intel/openvino/modeling_decoder.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 39a7bee9a2..11b487b96d 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -328,6 +328,12 @@ def _make_stateful(self):
         patch_stateful(self.config, self.model)
         self.stateful = True
 
+    def _set_ov_config_parameters(self):
+        super()._set_ov_config_parameters()
+
+        if "DYNAMIC_QUANTIZATION_GROUP_SIZE" in self.ov_config and "KV_CACHE_PRECISION" not in self.ov_config:
+            self.ov_config["KV_CACHE_PRECISION"] = "u8"
+
 
 @add_start_docstrings(
     """

From 3c12b86f2d425de9223dbe8320edc975ba1f5cbb Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 17:45:53 +0200
Subject: [PATCH 22/26] format

---
 tests/openvino/test_quantization.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 39246300da..67ccef3497 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -328,10 +328,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
             quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
 
             ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, ratio=0.8))
-            quantizer.quantize(
-                save_directory=tmp_dir,
-                ov_config=ov_config,
-            )
+            quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config)
             model = model_cls.from_pretrained(tmp_dir)
 
             _, num_int8, num_int4 = get_num_quantized_nodes(model)

From a4016b298adb9eabd0e97e036594ba891b8acafc Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 18:11:07 +0200
Subject: [PATCH 23/26] add test

---
 optimum/intel/openvino/configuration.py |  1 -
 tests/openvino/test_quantization.py     | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 1c950c3a23..739b5640a6 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -234,7 +234,6 @@ def __init__(
         **kwargs,
     ):
         super().__init__(bits=bits, sym=sym, group_size=weights_group_size, **kwargs)
-        # TODO add kv_cache_dtype
         self.activations_group_size = activations_group_size
 
 
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 67ccef3497..d09043a855 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -552,6 +552,29 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self):
 
             compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)
 
+    @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
+    def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantization_config, expected_ov_int4):
+        model_id = MODEL_NAMES[model_name]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            group_size = quantization_config.pop("group_size", 32)
+            quantization_config = OVDynamicQuantizationConfig(weights_group_size=group_size, activations_group_size=group_size, **quantization_config)
+            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+            self.assertEqual(model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"], str(group_size))
+            self.assertEqual(model.ov_config["KV_CACHE_PRECISION"], "u8")
+
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+
+            _, num_int4, _ = get_num_quantized_nodes(model)
+            self.assertEqual(expected_ov_int4, num_int4)
+            model.save_pretrained(tmp_dir)
+
+            openvino_config = OVConfig.from_pretrained(tmp_dir)
+            self.assertEqual(openvino_config.quantization_config.bits, 4)
+            self.assertEqual(openvino_config.dtype, "int4")
+
 
 class OVQuantizerQATest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = (("hf-internal-testing/tiny-random-BertForQuestionAnswering",),)
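The test added above pins down the runtime contract; in user code the flow looks roughly like this (a sketch mirroring what the test asserts, using a tiny test checkpoint; the exact node counts are model-specific):

    from optimum.intel import OVModelForCausalLM, OVDynamicQuantizationConfig

    quantization_config = OVDynamicQuantizationConfig(bits=4, weights_group_size=128, activations_group_size=32)
    model = OVModelForCausalLM.from_pretrained(
        "hf-internal-testing/tiny-random-gpt2", export=True, quantization_config=quantization_config
    )

    # The activation group size becomes an OpenVINO runtime property, and the
    # KV cache precision defaults to u8 whenever dynamic quantization is enabled.
    assert model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] == "32"
    assert model.ov_config["KV_CACHE_PRECISION"] == "u8"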
(("hf-internal-testing/tiny-random-BertForQuestionAnswering",),) From 35046063b3fcda289fcdf463acbc3900ede174d9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 19 Apr 2024 18:11:29 +0200 Subject: [PATCH 24/26] format --- tests/openvino/test_quantization.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index d09043a855..26dfc658a5 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -551,13 +551,14 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): } compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) - @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantization_config, expected_ov_int4): model_id = MODEL_NAMES[model_name] with tempfile.TemporaryDirectory() as tmp_dir: group_size = quantization_config.pop("group_size", 32) - quantization_config = OVDynamicQuantizationConfig(weights_group_size=group_size, activations_group_size=group_size, **quantization_config) + quantization_config = OVDynamicQuantizationConfig( + weights_group_size=group_size, activations_group_size=group_size, **quantization_config + ) model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) self.assertEqual(model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"], str(group_size)) self.assertEqual(model.ov_config["KV_CACHE_PRECISION"], "u8") From dbf77e43f41e2504b951bf793dd9ef4e6313793f Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 19 Apr 2024 18:18:56 +0200 Subject: [PATCH 25/26] move compilation step --- optimum/intel/openvino/modeling_base.py | 6 +++--- optimum/intel/openvino/modeling_diffusion.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index e317cc0fbe..e26a4fd0c9 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -87,9 +87,6 @@ def __init__( self.model = model self.request = None - if enable_compilation: - self.compile() - self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None self._openvino_config = None @@ -97,6 +94,9 @@ def __init__( self._openvino_config = OVConfig(quantization_config=quantization_config) self._set_ov_config_parameters() + if enable_compilation: + self.compile() + @staticmethod def load_model( file_name: Union[str, Path], diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 46e3e70b09..fb9bec7a8e 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -140,9 +140,6 @@ def __init__( if self.is_dynamic: self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1) - if compile: - self.compile() - sub_models = { DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, @@ -162,6 +159,9 @@ def __init__( self._openvino_config = OVConfig(quantization_config=quantization_config) self._set_ov_config_parameters() + if compile: + self.compile() + def _save_pretrained(self, save_directory: Union[str, Path]): """ Saves the model to the OpenVINO IR format so that it can be re-loaded using the From 17debf6345d218ceb9dc75ab3b1db2343d11d0c1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 19 Apr 2024 18:23:18 
From 17debf6345d218ceb9dc75ab3b1db2343d11d0c1 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 19 Apr 2024 18:23:18 +0200
Subject: [PATCH 26/26] set kv cache precision for seq2seq models

---
 optimum/intel/openvino/modeling_base.py    | 2 ++
 optimum/intel/openvino/modeling_decoder.py | 6 ------
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index e26a4fd0c9..a48cdf5c92 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -253,6 +253,8 @@ def _set_ov_config_parameters(self):
         q_config = self._openvino_config.quantization_config if self._openvino_config else None
         if isinstance(q_config, OVDynamicQuantizationConfig):
             self.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = str(q_config.activations_group_size)
+            if self.can_generate() and "KV_CACHE_PRECISION" not in self.ov_config:
+                self.ov_config["KV_CACHE_PRECISION"] = "u8"
 
     @staticmethod
     def _cached_file(
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 11b487b96d..39a7bee9a2 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -328,12 +328,6 @@ def _make_stateful(self):
         patch_stateful(self.config, self.model)
         self.stateful = True
 
-    def _set_ov_config_parameters(self):
-        super()._set_ov_config_parameters()
-
-        if "DYNAMIC_QUANTIZATION_GROUP_SIZE" in self.ov_config and "KV_CACHE_PRECISION" not in self.ov_config:
-            self.ov_config["KV_CACHE_PRECISION"] = "u8"
-
 
 @add_start_docstrings(
     """
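With the hook moved into the base class and guarded by `can_generate()`, the u8 KV-cache default now also covers sequence-to-sequence models. A hedged usage sketch, assuming OVModelForSeq2SeqLM routes through the shared base-class initialization as the diff suggests (tiny test checkpoint used for illustration):

    from optimum.intel import OVModelForSeq2SeqLM, OVDynamicQuantizationConfig

    quantization_config = OVDynamicQuantizationConfig(bits=8, activations_group_size=32)
    model = OVModelForSeq2SeqLM.from_pretrained(
        "hf-internal-testing/tiny-random-t5", export=True, quantization_config=quantization_config
    )

    # Seq2seq models can generate, so they now pick up the same KV cache default.
    assert model.ov_config.get("KV_CACHE_PRECISION") == "u8"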