
Commit f030583

Add a note about data-aware mixed precision assignment (#1075)
* Add a note about data-aware mixed precision assignment
* Add a note to dataset parameter
* Update docs/source/openvino/export.mdx
  Co-authored-by: Helena Kloosterman <helena.kloosterman@intel.com>
* Add a warning

---------

Co-authored-by: Helena Kloosterman <helena.kloosterman@intel.com>
1 parent 7601bfd · commit f030583

3 files changed: +25 -5 lines

docs/source/openvino/export.mdx (+5 -2)
```diff
@@ -78,7 +78,8 @@ Optional arguments:
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
                         quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
                         quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-                        and inference latency. Default value is 1.0.
+                        and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+                        less than 1.0, then data-aware mixed precision assignment will be applied.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
@@ -94,7 +95,9 @@ Optional arguments:
                         can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
                         be collected from model's generations. For diffusion models it should be on of
                         ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-                        visual language models the dataset must be set to 'contextual'.
+                        visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+                        compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+                        argument will not have an effect on the resulting model.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
```
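
Read together, the two notes describe when `--dataset` actually matters. As a hypothetical invocation (the model ID and output directory below are placeholders, not part of this commit), combining a dataset with a ratio below 1.0 enables data-aware mixed precision assignment; omitting `--ratio` (leaving it at 1.0) without also passing a data-aware flag such as `--awq` would leave the dataset unused:

```bash
# Placeholder model and output directory; the flags are the documented ones above.
optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf \
  --weight-format int4 --ratio 0.8 --dataset wikitext2 \
  llama2_int4_ov
```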

optimum/commands/export/openvino.py (+5 -2)
```diff
@@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be on of ['conceptual_captions',"
             "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(
```
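
A side detail visible in this hunk: argparse applies %-style formatting to help strings (e.g. for `%(default)s`), so a literal percent sign must be escaped as `%%`. A minimal standalone sketch (not the actual optimum-cli parser):

```python
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
    "--ratio",
    type=float,
    default=None,
    # argparse %-formats help text, so a literal "%" is written "%%".
    help="If set to 0.8, 80%% of the layers will be quantized to int4.",
)
args = parser.parse_args(["--ratio", "0.8"])  # --help renders "80%"
```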

optimum/intel/openvino/configuration.py (+15 -1)
```diff
@@ -344,6 +344,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
+            Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment
+            will be applied.
         all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`str`, *optional*):
@@ -441,7 +443,7 @@ def post_init(self):
         Safety checker that arguments are correct
         """
         super().post_init()
-        if self.ratio is not None and not (0 <= self.ratio <= 1):
+        if not (0 <= self.ratio <= 1):
             raise ValueError("`ratio` must between 0 and 1.")
         if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
             raise ValueError("`group_size` must be greater than 0 or equal to -1")
@@ -461,6 +463,18 @@ def post_init(self):
                 or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
             )
 
+        if self.dataset is not None and not (
+            self.quant_method == OVQuantizationMethod.AWQ
+            or self.scale_estimation
+            or self.gptq
+            or self.lora_correction
+            or (self.ratio < 1.0 and self.sensitivity_metric != nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR)
+        ):
+            logger.warning(
+                "The provided dataset won't have any effect on the resulting compressed model because no data-aware "
+                "quantization algorithm is selected and compression ratio is 1.0."
+            )
+
         if self.bits not in [4, 8]:
             raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")
```
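
The new warning fires only when a dataset is supplied but nothing can consume it: none of the data-aware algorithms (AWQ, scale estimation, GPTQ, LoRA correction) is enabled and the ratio is 1.0, or the sensitivity metric is the data-free weight-quantization error. A rough sketch of the cases, assuming optimum-intel with the NNCF extra installed (parameter values are illustrative, not from this commit):

```python
from optimum.intel import OVWeightQuantizationConfig

# dataset + ratio < 1.0: data-aware mixed precision assignment, no warning.
data_aware = OVWeightQuantizationConfig(bits=4, ratio=0.8, dataset="wikitext2")

# dataset with the default ratio of 1.0 and no data-aware algorithm enabled:
# post_init() now logs that the dataset will have no effect.
ignored = OVWeightQuantizationConfig(bits=4, dataset="wikitext2")

# dataset + scale estimation: the dataset is consumed even at ratio 1.0.
scale_est = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", scale_estimation=True)
```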
