
Commit 80e9bf6

Fix openvino quantization config (huggingface#773)
* enable string quant method
* fix
* fix docstrings
* format
* awq should be set to None for int8 quantization
1 parent c19723e commit 80e9bf6

4 files changed: +25 −12 lines changed
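
In short, the commit lets `quant_method` be passed as a plain string ("default", "hybrid", "awq") wherever a quantization config is built. A minimal sketch of the resulting behavior, assuming an optimum-intel build that includes this patch; constructing the config touches no model weights, and "c4" is only an example calibration dataset name:

```python
# A minimal sketch of what this commit enables, assuming an optimum-intel
# build that includes this patch. "c4" is just an example dataset name.
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino.configuration import OVQuantizationMethod

config = OVWeightQuantizationConfig(bits=4, quant_method="awq", dataset="c4")

# __init__ now normalizes the string to the enum member (see configuration.py below).
assert config.quant_method is OVQuantizationMethod.AWQ
```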

optimum/commands/export/openvino.py (+1, −2)

```diff
@@ -20,7 +20,6 @@
 from typing import TYPE_CHECKING, Optional

 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
-from transformers.utils.quantization_config import QuantizationMethod

 from ...exporters import TasksManager
 from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
@@ -289,7 +288,7 @@ def _get_default_int4_config(model_id_or_path, library_name):
             "all_layers": None if is_int8 else self.args.all_layers,
             "dataset": self.args.dataset,
             "num_samples": self.args.num_samples,
-            "quant_method": QuantizationMethod.AWQ if self.args.awq else None,
+            "quant_method": "awq" if self.args.awq else "default",
             "sensitivity_metric": self.args.sensitivity_metric,
             "scale_estimation": self.args.scale_estimation,
         }
```
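
For context, the dict above is what the export command hands on as a quantization config, so the boolean `--awq` CLI flag now maps to one of two plain strings rather than an enum-or-None. A hypothetical standalone illustration of that mapping (the helper name is invented for this sketch):

```python
# Hypothetical helper mirroring the expression in the hunk above: the CLI's
# boolean awq flag becomes a string the quantization config understands.
def quant_method_for(awq_flag: bool) -> str:
    return "awq" if awq_flag else "default"

assert quant_method_for(True) == "awq"
assert quant_method_for(False) == "default"
```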

optimum/intel/openvino/configuration.py (+5, −4)

```diff
@@ -20,7 +20,7 @@

 import torch
 from transformers import PretrainedConfig
-from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod
+from transformers.utils.quantization_config import QuantizationConfigMixin

 from optimum.configuration_utils import BaseConfig

@@ -78,6 +78,7 @@
 class OVQuantizationMethod(str, Enum):
     DEFAULT = "default"
     HYBRID = "hybrid"
+    AWQ = "awq"


 @dataclass
@@ -171,7 +172,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
         num_samples (`int`, *optional*):
             The maximum number of samples composing the calibration dataset.
-        quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT):
+        quant_method (`str` or `OVQuantizationMethod`, defaults to OVQuantizationMethod.DEFAULT):
             Weight compression method to apply. Possible options:
                 - "default": default weight quantization will be applied.
                 - "awq": compressed weights will be computed according to the Activation-Aware-Quantization (AWQ)
@@ -199,7 +200,7 @@ def __init__(
         sensitivity_metric: Optional[str] = None,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
-        quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
+        quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
         scale_estimation: bool = None,
         **kwargs,
     ):
@@ -210,7 +211,7 @@
         self.ratio = ratio
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
-        self.quant_method = quant_method
+        self.quant_method = OVQuantizationMethod(quant_method) if isinstance(quant_method, str) else quant_method
         self.scale_estimation = scale_estimation
         self.post_init()
```
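
The last hunk is the heart of the fix: `__init__` normalizes strings to enum members, so downstream code can compare against `OVQuantizationMethod` regardless of how the caller spelled the value. A self-contained sketch of the idiom, using a local mirror of the patched class rather than an import:

```python
from enum import Enum


class OVQuantizationMethod(str, Enum):  # local mirror of the enum patched above
    DEFAULT = "default"
    HYBRID = "hybrid"
    AWQ = "awq"


def normalize(quant_method):
    # The pattern from the patched __init__: accept a string or an enum
    # member, always store an enum member.
    return OVQuantizationMethod(quant_method) if isinstance(quant_method, str) else quant_method


assert normalize("awq") is OVQuantizationMethod.AWQ
assert normalize(OVQuantizationMethod.HYBRID) is OVQuantizationMethod.HYBRID
```

A side benefit of the `OVQuantizationMethod(quant_method)` call: an unknown string such as "awqq" raises `ValueError` at config construction, rather than failing later inside NNCF.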

optimum/intel/openvino/quantization.py (+1, −2)

```diff
@@ -38,7 +38,6 @@
 from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
-from transformers.utils.quantization_config import QuantizationMethod

 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
@@ -828,7 +827,7 @@ def _weight_only_quantization(
         group_size=config.group_size,
         all_layers=config.all_layers,
         sensitivity_metric=sensitivity_metric,
-        awq=config.quant_method == QuantizationMethod.AWQ or None,
+        awq=getattr(config.quant_method, "name", "") == "AWQ" or None,
         ignored_scope=config.get_ignored_scope_instance(),
         dataset=dataset,
         subset_size=config.num_samples if config.num_samples else 128,
```
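
The `... or None` in the `awq=` argument implements the commit note that "awq should be set to None for int8 quantization": the expression evaluates to `True` when AWQ is requested and to `None` (never `False`) otherwise. A standalone sketch of the idiom:

```python
# Mirrors the expression in the hunk above: True when AWQ is requested,
# None otherwise -- never False -- matching the commit note that awq must
# be None for int8 quantization.
def awq_kwarg(quant_method_name: str):
    return quant_method_name == "AWQ" or None

assert awq_kwarg("AWQ") is True
assert awq_kwarg("DEFAULT") is None
```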

tests/openvino/test_quantization.py (+18, −4)

```diff
@@ -235,6 +235,20 @@ class OVWeightCompressionTest(unittest.TestCase):
             ),
             16,
         ),
+        (
+            OVModelForCausalLM,
+            "llama_awq",
+            dict(
+                bits=4,
+                sym=True,
+                group_size=16,
+                ratio=0.8,
+                sensitivity_metric="mean_activation_magnitude",
+                dataset="c4",
+                quant_method="awq",
+            ),
+            16,
+        ),
     )

     SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = (
@@ -413,9 +427,9 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset(
         ]
         model = model_cls.from_pretrained(model_id, export=True)
         quantizer = OVQuantizer(model)
-        quantization_config = OVWeightQuantizationConfig(
-            bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID
-        )
+        quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=3, quant_method="hybrid")
+        self.assertEqual(quantization_config.quant_method, OVQuantizationMethod.HYBRID)
+
         quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset)
         num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
         self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
@@ -454,7 +468,7 @@ def test_ovmodel_4bit_auto_compression_with_config(
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
-            if quantization_config.quant_method == QuantizationMethod.AWQ or quantization_config.scale_estimation:
+            if quantization_config.quant_method.lower() == "awq" or quantization_config.scale_estimation:
                 # TODO: Check that AWQ and SE was actually applied
                 pass
```
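
One aside on the last hunk: `quantization_config.quant_method.lower()` works because `OVQuantizationMethod` subclasses `str` (visible in the configuration.py hunk above), so members carry string methods and compare equal to their values:

```python
from enum import Enum


class OVQuantizationMethod(str, Enum):  # local mirror of the patched enum
    DEFAULT = "default"
    HYBRID = "hybrid"
    AWQ = "awq"


# str-subclass enum members inherit str methods and compare equal to their
# values, which is what the updated test relies on.
assert OVQuantizationMethod.AWQ.lower() == "awq"
assert OVQuantizationMethod.AWQ == "awq"
```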
