From 0fe3ced3146cd689ac325aecc3bb5df426621fe6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 17 Apr 2024 16:05:11 +0200 Subject: [PATCH 1/4] Fix quantization call in QA notebook --- .../openvino/question_answering_quantization.ipynb | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/notebooks/openvino/question_answering_quantization.ipynb b/notebooks/openvino/question_answering_quantization.ipynb index ba4a84ca38..196e3ba6a7 100644 --- a/notebooks/openvino/question_answering_quantization.ipynb +++ b/notebooks/openvino/question_answering_quantization.ipynb @@ -51,7 +51,7 @@ "import transformers\n", "from evaluate import evaluator\n", "from openvino.runtime import Core\n", - "from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer\n", + "from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n", "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n", "\n", "transformers.logging.set_verbosity_error()\n", @@ -286,11 +286,11 @@ "**NOTE:** if you notice very low accuracy after post-training quantization, it is likely caused by an overflow issue which affects processors that do not contain VNNI (Vector Neural Network Instruction). NNCF has an `overflow_fix` option to address this. It will effectively use 7-bits for quantizing instead of 8-bits to prevent the overflow. 
To use this option, modify the code in the next cell to add an explicit quantization configuration, and set `overflow_fix` to `\"enable\"`:\n", "\n", "```\n", - "from optimum.intel.openvino import OVConfig\n", + "from optimum.intel.openvino import OVConfig, OVQuantizationConfig\n", "\n", - "ov_config = OVConfig()\n", - "ov_config.compression[\"overflow_fix\"] = \"enable\"\n", - "quantizer = OVQuantizer.from_pretrained(model, ov_config=ov_config)\n", + "ov_config = OVConfig(quantization_config=OVQuantizationConfig(overflow_fix=\"enable\"))\n", + "quantizer = OVQuantizer.from_pretrained(model)\n", + "quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path, ov_config=ov_config)\n", "```\n", "\n", "For more information, see [Lower Numerical Precision Deep Learning Inference and Training](https://www.intel.com/content/www/us/en/developer/articles/technical/lower-numerical-precision-deep-learning-inference-and-training.html)" ] }, { @@ -317,7 +317,8 @@ "\n", "# Quantize the model\n", "quantizer = OVQuantizer.from_pretrained(model)\n", - "quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path)" + "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n", + "quantizer.quantize(calibration_dataset=train_dataset, ov_config=ov_config, save_directory=int8_ptq_model_path)" ] }, { From 4f8893794b8340e1fe317f689bdf4e9afdbe39e8 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 17 Apr 2024 16:11:42 +0200 Subject: [PATCH 2/4] Update OV quantization docs --- README.md | 5 +++-- docs/source/optimization_ov.mdx | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 78ca130145..e93cacf057 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ Post-training static quantization introduces an additional calibration step wher ```python from functools import partial -from optimum.intel import OVQuantizer, OVModelForSequenceClassification +from optimum.intel import 
OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig from transformers import AutoTokenizer, AutoModelForSequenceClassification model_id = "distilbert-base-uncased-finetuned-sst-2-english" @@ -145,7 +145,8 @@ calibration_dataset = quantizer.get_calibration_dataset( # The directory where the quantized model will be saved save_dir = "nncf_results" # Apply static quantization and save the resulting model in the OpenVINO IR format -quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +ov_config = OVConfig(quantization_config=OVQuantizationConfig()) +quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir) # Load the quantized model optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir) ``` diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 1e78c36805..e018134964 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o ```python from transformers import AutoTokenizer -from optimum.intel import OVQuantizer, OVModelForSequenceClassification, +from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig model_id = "distilbert-base-uncased-finetuned-sst-2-english" model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) @@ -95,7 +95,8 @@ save_dir = "ptq_model" quantizer = OVQuantizer.from_pretrained(model) # Apply static quantization and export the resulting quantized model to OpenVINO IR format -quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +ov_config = OVConfig(quantization_config=OVQuantizationConfig()) +quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir) # Save the tokenizer tokenizer.save_pretrained(save_dir) ``` From 
fb2b2ed452813d67c35f175ded7fa026f022eea7 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 17 Apr 2024 16:19:29 +0200 Subject: [PATCH 3/4] Apply PTQ if quantization config was not provided, but calibration dataset was provided --- optimum/intel/openvino/quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 33985dbe6e..196b779973 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -280,7 +280,7 @@ def quantize( raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") quantization_config = ov_config.quantization_config if quantization_config is None: - if weights_only is None or weights_only is True: + if (weights_only is None or weights_only is True) and calibration_dataset is None: if weights_only is None: logger.info( "`quantization_config` was not provided, 8-bit asymmetric weight quantization will be applied." From 19ee3a705a351d7a4e17f25007fc100fb5fa43b5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 17 Apr 2024 16:26:48 +0200 Subject: [PATCH 4/4] Add warning --- optimum/intel/openvino/quantization.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 196b779973..9c377520ad 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -287,6 +287,11 @@ def quantize( ) ov_config.quantization_config = OVWeightQuantizationConfig(bits=8) else: + logger.warning( + "`quantization_config` was not provided, but calibration dataset was provided, assuming full " + "model quantization is intended. In the future, please provide `quantization_config` as an " + "instance of OVQuantizationConfig." + ) ov_config.quantization_config = OVQuantizationConfig() if isinstance(self.model, OVBaseModel):