
Commit f030583

Add a note about data-aware mixed precision assignment (#1075)
* Add a note about data-aware mixed precision assignment
* Add a note to dataset parameter
* Update docs/source/openvino/export.mdx
  Co-authored-by: Helena Kloosterman <helena.kloosterman@intel.com>
* Add a warning

---------

Co-authored-by: Helena Kloosterman <helena.kloosterman@intel.com>
1 parent 7601bfd · commit f030583

3 files changed: +25 -5 lines

docs/source/openvino/export.mdx (+5 -2)
```diff
@@ -78,7 +78,8 @@ Optional arguments:
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
                         quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
                         quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-                        and inference latency. Default value is 1.0.
+                        and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+                        less than 1.0, then data-aware mixed precision assignment will be applied.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
@@ -94,7 +95,9 @@ Optional arguments:
                         can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
                         be collected from model's generations. For diffusion models it should be on of
                         ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-                        visual language models the dataset must be set to 'contextual'.
+                        visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+                        compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+                        argument will not have an effect on the resulting model.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
```
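
Read together, the two notes describe when `--dataset` actually matters. As a hypothetical invocation (the model ID and output directory below are placeholders, not part of this commit), combining a dataset with a ratio below 1.0 enables data-aware mixed precision assignment; omitting `--ratio` (leaving it at 1.0) without also passing a data-aware flag such as `--awq` would leave the dataset unused:

```bash
# Placeholder model and output directory; the flags are the documented ones above.
optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf \
  --weight-format int4 --ratio 0.8 --dataset wikitext2 \
  llama2_int4_ov
```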

optimum/commands/export/openvino.py (+5 -2)
```diff
@@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be on of ['conceptual_captions',"
             "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(
```
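
A side detail visible in this hunk: argparse applies %-style formatting to help strings (e.g. for `%(default)s`), so a literal percent sign must be escaped as `%%`. A minimal standalone sketch (not the actual optimum-cli parser):

```python
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
    "--ratio",
    type=float,
    default=None,
    # argparse %-formats help text, so a literal "%" is written "%%".
    help="If set to 0.8, 80%% of the layers will be quantized to int4.",
)
args = parser.parse_args(["--ratio", "0.8"])  # --help renders "80%"
```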

optimum/intel/openvino/configuration.py (+15 -1)
```diff
@@ -344,6 +344,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
+            Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment
+            will be applied.
         all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`str`, *optional*):
@@ -441,7 +443,7 @@ def post_init(self):
         Safety checker that arguments are correct
         """
         super().post_init()
-        if self.ratio is not None and not (0 <= self.ratio <= 1):
+        if not (0 <= self.ratio <= 1):
             raise ValueError("`ratio` must between 0 and 1.")
         if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
             raise ValueError("`group_size` must be greater than 0 or equal to -1")
@@ -461,6 +463,18 @@ def post_init(self):
                 or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
             )
 
+        if self.dataset is not None and not (
+            self.quant_method == OVQuantizationMethod.AWQ
+            or self.scale_estimation
+            or self.gptq
+            or self.lora_correction
+            or (self.ratio < 1.0 and self.sensitivity_metric != nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR)
+        ):
+            logger.warning(
+                "The provided dataset won't have any effect on the resulting compressed model because no data-aware "
+                "quantization algorithm is selected and compression ratio is 1.0."
+            )
+
         if self.bits not in [4, 8]:
             raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")
```
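
The new warning fires only when a dataset is supplied but nothing can consume it: none of the data-aware algorithms (AWQ, scale estimation, GPTQ, LoRA correction) is enabled and the ratio is 1.0, or the sensitivity metric is the data-free weight-quantization error. A rough sketch of the cases, assuming optimum-intel with the NNCF extra installed (parameter values are illustrative, not from this commit):

```python
from optimum.intel import OVWeightQuantizationConfig

# dataset + ratio < 1.0: data-aware mixed precision assignment, no warning.
data_aware = OVWeightQuantizationConfig(bits=4, ratio=0.8, dataset="wikitext2")

# dataset with the default ratio of 1.0 and no data-aware algorithm enabled:
# post_init() now logs that the dataset will have no effect.
ignored = OVWeightQuantizationConfig(bits=4, dataset="wikitext2")

# dataset + scale estimation: the dataset is consumed even at ratio 1.0.
scale_est = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", scale_estimation=True)
```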
