
Commit 1275d0a

Changed the naming. Added additional tests
1 parent ae1da0f commit 1275d0a

6 files changed (+52 -23 lines)

6 files changed

+52
-23
lines changed

optimum/intel/openvino/configuration.py (+9 -2)

@@ -19,6 +19,8 @@
 
 from optimum.configuration_utils import BaseConfig
 
+from .weight_quantization import OVWeightQuantizationConfig
+
 
 DEFAULT_QUANTIZATION_CONFIG = {
     "algorithm": "quantization",
@@ -84,7 +86,7 @@ def __init__(
         compression: Union[List[Dict], Dict, None] = None,
         input_info: Optional[List] = None,
         save_onnx_model: bool = False,
-        weight_quantization_config: Optional[QuantizationConfigMixin] = None,
+        quantization_config: Optional[QuantizationConfigMixin] = None,
         **kwargs,
     ):
         super().__init__()
@@ -93,7 +95,7 @@ def __init__(
         self.save_onnx_model = save_onnx_model
         self._enable_standard_onnx_export_option()
         self.optimum_version = kwargs.pop("optimum_version", None)
-        self.weight_quantization_config = weight_quantization_config
+        self.quantization_config = quantization_config
 
     def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False):
         self.input_info = [
@@ -105,6 +107,11 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False):
             for name, value in model_inputs.items()
         ]
 
+    def save_pretrained(self, *args, **kwargs):
+        if self.quantization_config is None:
+            self.quantization_config = OVWeightQuantizationConfig()
+        super().save_pretrained(*args, **kwargs)
+
     def _enable_standard_onnx_export_option(self):
         # This method depends on self.save_onnx_model.
         # save_onnx_model is defaulted to false so that the final model output is
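
With the rename, weight-compression settings travel through OVConfig under the generic quantization_config name, and save_pretrained now backfills a default OVWeightQuantizationConfig when none was supplied. A minimal sketch of the new spelling, mirroring the updated test further down (import paths are assumed from this branch's layout):

    import nncf
    from optimum.intel import OVConfig
    from optimum.intel.openvino.weight_quantization import OVWeightQuantizationConfig

    # Old spelling: OVConfig(weight_quantization_config=...); new: OVConfig(quantization_config=...)
    ov_config = OVConfig(
        quantization_config=OVWeightQuantizationConfig(mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8)
    )

    # Leaving quantization_config unset is also fine: save_pretrained() fills in
    # a default OVWeightQuantizationConfig before serializing (directory name illustrative).
    OVConfig().save_pretrained("ov_config_dir")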

optimum/intel/openvino/modeling_base.py (+1)

@@ -290,6 +290,7 @@ def _from_transformers(
         save_dir = TemporaryDirectory()
         save_dir_path = Path(save_dir.name)
 
+        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
         compression_option = None
         if load_in_8bit is not None:
             compression_option = "fp32"

optimum/intel/openvino/modeling_decoder.py (+1)

@@ -262,6 +262,7 @@ def _from_transformers(
         if use_cache:
             task = task + "-with-past"
 
+        # If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
         compression_option = None
         if load_in_8bit is not None or load_in_4bit is not None:
             compression_option = "fp32"

optimum/intel/openvino/quantization.py (+4 -2)

@@ -49,7 +49,7 @@
     ONNX_WEIGHTS_NAME,
     OV_XML_FILE_NAME,
 )
-from .weight_quantization import compress_decoder_weights
+from .weight_quantization import OVWeightQuantizationConfig, compress_decoder_weights
 
 
 COMPRESSION_OPTIONS = {
@@ -318,12 +318,14 @@ def _quantize_ovcausallm(
         save_directory.mkdir(parents=True, exist_ok=True)
 
         if weights_only:
-            quantization_config = None if ov_config is None else ov_config.weight_quantization_config
+            quantization_config = None if ov_config is None else ov_config.quantization_config
             if quantization_config is None:
                 # Use default 8-bit compression
+                quantization_config = OVWeightQuantizationConfig(mode=nncf.CompressWeightsMode.INT8_SYM)
                 self.model.model = nncf.compress_weights(self.model.model)
             else:
                 compress_decoder_weights(self.model, quantization_config)
+
             self.model.save_pretrained(save_directory)
             return
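
Together with the import change above, the weights-only branch now always resolves to a concrete OVWeightQuantizationConfig: either the one carried on ov_config.quantization_config or a default INT8_SYM one. A rough sketch of driving this path through OVQuantizer, based on the updated test; the quantize() keyword names (weights_only, save_directory) are assumptions inferred from the internal _quantize_ovcausallm signature rather than shown in this diff:

    from transformers import AutoModelForCausalLM
    from optimum.intel import OVQuantizer

    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")
    quantizer = OVQuantizer.from_pretrained(model, task="text-generation")

    # No quantization_config supplied (and no OVConfig at all): the weights-only
    # branch falls back to nncf.compress_weights with the INT8_SYM default.
    quantizer.quantize(save_directory="ov_int8_model", weights_only=True)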

optimum/intel/openvino/weight_quantization.py (+1 -1)

@@ -59,7 +59,7 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
 
     def __init__(
         self,
-        mode=nncf.CompressWeightsMode.INT4_ASYM,
+        mode=None,
         tokenizer: Any = None,
         dataset: Optional[Union[nncf.Dataset, str]] = None,
         ratio: Optional[float] = None,
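
Since the default mode is now None (leaving the concrete compression mode to be chosen downstream), callers who want a specific scheme pass it explicitly. A minimal sketch, with the mode and ratio values taken from the updated test and the import path assumed from this branch:

    import nncf
    from optimum.intel.openvino.weight_quantization import OVWeightQuantizationConfig

    # Explicit 4-bit symmetric weight compression; ratio controls the share of
    # weights compressed to 4-bit (the remainder stays in 8-bit).
    config = OVWeightQuantizationConfig(mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8)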

tests/openvino/test_quantization.py (+36 -18)

@@ -155,6 +155,7 @@ class OVWeightCompressionTest(unittest.TestCase):
     )
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),)
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 6, 379),)
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = (
         (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 16, 136),
     )
@@ -287,9 +288,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
 
         quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
         ov_config = OVConfig(
-            weight_quantization_config=OVWeightQuantizationConfig(
-                mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8
-            )
+            quantization_config=OVWeightQuantizationConfig(mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8)
         )
         quantizer.quantize(
             save_directory=tmp_dir,
@@ -330,25 +329,43 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
     def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
-        model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False)
+
+            if model.export_feature.startswith("text2text-generation"):
+                models = [model.encoder, model.decoder, model.decoder_with_past]
+            elif model.export_feature.startswith("stable-diffusion"):
+                models = [model.unet, model.vae_encoder, model.vae_decoder]
+                models.append(
+                    model.text_encoder if model.export_feature == "stable-diffusion" else model.text_encoder_2
+                )
+            else:
+                models = [model]
 
-        if model.export_feature.startswith("text2text-generation"):
-            models = [model.encoder, model.decoder, model.decoder_with_past]
-        elif model.export_feature.startswith("stable-diffusion"):
-            models = [model.unet, model.vae_encoder, model.vae_decoder]
-            models.append(model.text_encoder if model.export_feature == "stable-diffusion" else model.text_encoder_2)
-        else:
-            models = [model]
+            expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
+            for i, model in enumerate(models):
+                _, num_int8, _ = get_num_quantized_nodes(model)
+                self.assertEqual(expected_ov_int8[i], num_int8)
+            model.save_pretrained(tmp_dir)
 
-        expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
-        for i, model in enumerate(models):
-            _, num_int8, _ = get_num_quantized_nodes(model)
-            self.assertEqual(expected_ov_int8[i], num_int8)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS)
+    def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_int8, expected_ov_int4):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model_id = MODEL_NAMES[model_type]
+            model = model_cls.from_pretrained(model_id, export=True, load_in_4bit=True)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
 
-    @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
-    def test_ovmodel_4bit_auto_compression(self, model_cls, model_id, quantization_config, expected_ov_int4):
-        task = model_cls.export_feature
+            _, num_int8, num_int4 = get_num_quantized_nodes(model)
+            self.assertEqual(expected_ov_int4, num_int4)
+            self.assertEqual(expected_ov_int8, num_int8)
+            model.save_pretrained(tmp_dir)
 
+    @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
+    def test_ovmodel_4bit_auto_compression_with_config(
+        self, model_cls, model_id, quantization_config, expected_ov_int4
+    ):
         with tempfile.TemporaryDirectory() as tmp_dir:
             model = model_cls.from_pretrained(
                 model_id, export=True, load_in_4bit=True, quantization_config=quantization_config
@@ -359,6 +376,7 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_id, quantization_c
 
             _, num_int4, _ = get_num_quantized_nodes(model)
             self.assertEqual(expected_ov_int4, num_int4)
+            model.save_pretrained(tmp_dir)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS)
     def test_ovmodel_4bit_auto_compression_with_custom_dataset(
