
Commit c9ad544

Skip weight compression tests for nncf==2.8.0 and rework the optimization logic for stateful PyTorch models

1 parent 87b36db

File tree: 4 files changed, +67 −52 lines

optimum/exporters/openvino/convert.py (+3 −21)

@@ -31,14 +31,7 @@
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.utils import is_diffusers_available

-from ...intel.utils.import_utils import (
-    _torch_version,
-    _transformers_version,
-    is_nncf_available,
-    is_optimum_version,
-    is_torch_version,
-    is_transformers_version,
-)
+from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
 from .model_patcher import patch_model_with_bettertransformer
 from .stateful import ensure_stateful_is_available, patch_stateful
 from .utils import (

@@ -329,19 +322,8 @@ def export_pytorch(
     logger.info(f"Using framework PyTorch: {torch.__version__}")
     output = Path(output)

-    if stateful:
-        if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
-            COLOR_RED = "\033[1;31m"
-            COLOR_RESET = "\033[0m"
-            logger.warning(
-                COLOR_RED
-                + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
-                f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
-                "Consider upgrading PyTorch and Transformers, for example by running "
-                "`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
-                + COLOR_RESET
-            )
-
+    is_model_stateful = hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True
+    if stateful and not is_model_stateful:
         # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
         # both of them are applied to demonstrate the best performance.
         # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
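
The version warning that used to live here moves into patch_model_with_bettertransformer (next file), and export_pytorch now skips re-patching a model that was already converted upstream. A minimal sketch of the new guard, using a hypothetical DummyModel stand-in for an already-patched transformers model:

# Minimal sketch of the guard above; DummyModel is a hypothetical stand-in
# for a model already converted with BetterTransformer (which sets the
# use_bettertransformer flag).
class DummyModel:
    use_bettertransformer = True

def should_apply_bettertransformer(model, stateful: bool) -> bool:
    # Re-patch only when a stateful export was requested and the model has
    # not been patched yet.
    is_model_stateful = hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True
    return stateful and not is_model_stateful

assert should_apply_bettertransformer(DummyModel(), stateful=True) is False
assert should_apply_bettertransformer(object(), stateful=True) is True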

optimum/exporters/openvino/model_patcher.py (+16 −5)

@@ -14,16 +14,27 @@

 import logging as log

-from optimum.intel.utils.import_utils import is_torch_version
+from optimum.intel.utils.import_utils import (
+    is_torch_version,
+    is_transformers_version,
+    _torch_version,
+    _transformers_version,
+)


 def patch_model_with_bettertransformer(model):
-    if is_torch_version("<", "2.0"):
+    if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
+        COLOR_RED = "\033[1;31m"
+        COLOR_RESET = "\033[0m"
         log.warn(
-            "integration Scaled Dot Product Attention optimization supported only with torch > 2.0."
-            "Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
-            "It is recommended to upgrade PyTorch version for using stateful model or use stateful=False"
+            COLOR_RED
+            + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
+            f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
+            "Consider upgrading PyTorch and Transformers, for example by running "
+            "`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
+            + COLOR_RESET
         )
+
     # model already has required SDPA implementation
     if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa":
         return model
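
For context, a usage sketch of the patched helper; the checkpoint choice and the final print are illustrative assumptions, not part of the commit:

# Usage sketch (checkpoint choice is illustrative): patch a causal LM with
# BetterTransformer before a stateful OpenVINO export.
from transformers import AutoModelForCausalLM

from optimum.exporters.openvino.model_patcher import patch_model_with_bettertransformer

model = AutoModelForCausalLM.from_pretrained("gpt2")
model = patch_model_with_bettertransformer(model)  # warns if transformers/torch are too old
# BetterTransformer marks converted models, which is what convert.py now checks.
print(getattr(model, "use_bettertransformer", False))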

optimum/intel/openvino/quantization.py (+42 −23)

@@ -24,7 +24,7 @@
 import transformers
 from accelerate.data_loader import DataLoaderStateMixin
 from datasets import Dataset, load_dataset
-from nncf import NNCFConfig, compress_weights
+from nncf import NNCFConfig
 from nncf.torch import create_compressed_model, register_default_init_args, register_module
 from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk
 from nncf.torch.initialization import PTInitializingDataLoader

@@ -34,11 +34,13 @@
 from transformers import DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D

+from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
 from optimum.quantization_base import OptimumQuantizer

 from ...exporters.openvino import export, export_pytorch_via_onnx
-from ...exporters.openvino.stateful import ensure_export_task_support_stateful
+from ...exporters.openvino.model_patcher import patch_model_with_bettertransformer
+from ...exporters.openvino.stateful import ensure_export_task_support_stateful, ensure_stateful_is_available
 from ..utils.constant import _TASK_ALIASES
 from .configuration import OVConfig
 from .modeling_base import OVBaseModel

@@ -344,9 +346,7 @@ def __getattr__(self, attr):
             self.model.model,
             quantization_dataset,
             model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"),
-            fast_bias_correction=True
-            if not kwargs.get("fast_bias_correction")
-            else kwargs.get("fast_bias_correction"),
+            fast_bias_correction=True if not kwargs.get("fast_bias_correction") else kwargs.get("fast_bias_correction"),
             **kwargs,
         )
         self.model.model = quantized_model
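
The fast_bias_correction argument is only reflowed here, but the pattern is worth spelling out: `True if not kwargs.get(k) else kwargs.get(k)` yields True whenever the key is absent or falsy, and the stored value otherwise. A behavior sketch with illustrative inputs:

# Behavior sketch of the defaulting pattern above (inputs are illustrative).
def resolve_fast_bias_correction(kwargs: dict):
    return True if not kwargs.get("fast_bias_correction") else kwargs.get("fast_bias_correction")

assert resolve_fast_bias_correction({}) is True                               # missing -> True
assert resolve_fast_bias_correction({"fast_bias_correction": False}) is True  # falsy -> True
assert resolve_fast_bias_correction({"fast_bias_correction": True}) is True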
@@ -388,13 +388,44 @@ def _quantize_torchmodel(
             if file_name is None and quantization_config.save_onnx_model
             else Path(ov_file_name).with_suffix(".onnx")
         )

+        task = self.task
+        model = self.model
+        self.model.config.save_pretrained(save_directory)
+        if task.startswith("text-generation"):
+            onnx_config = onnx_config_class(
+                model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache
+            )
+            if model.config.use_cache:
+                task = "text-generation-with-past"
+        else:
+            onnx_config = onnx_config_class(model.config)
+
+        stateful = ensure_stateful_is_available() and ensure_export_task_support_stateful(task)
+
         if weights_only:
-            if getattr(self.model.config, "tie_word_embeddings", True):
-                # to fix problem with shared embedding weights in nncf compress_weights()
-                self.model.tie_weights()
-            compressed_model = compress_weights(self.model)
-            self.model = compressed_model
+            from torch.utils._pytree import tree_map
+
+            if stateful:
+                # patch model before weight compression
+                model = patch_model_with_bettertransformer(model)
+
+            dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt")
+            device = self.model.device
+            dummy_inputs = tree_map(
+                lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs
+            )
+            check_dummy_inputs_are_allowed(model, dummy_inputs)
+
+            nncf.compress_weights(self.model, dataset=nncf.Dataset([dummy_inputs]))
         else:
+            if stateful:
+                logger.warn(
+                    "Quantization algorithm does not support optimized stateful models. "
+                    "The original model without optimization will be quantized and export."
+                )
+                stateful = False
+
             calibration_dataloader = self._get_calibration_dataloader(
                 calibration_dataset=calibration_dataset,
                 batch_size=batch_size,

@@ -411,26 +442,14 @@ def _quantize_torchmodel(
             )
             compressed_model = controller.strip(do_copy=False)

-        task = self.task
-        model = self.model
-        self.model.config.save_pretrained(save_directory)
-        if task.startswith("text-generation"):
-            onnx_config = onnx_config_class(
-                model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache
-            )
-            if model.config.use_cache:
-                task = "text-generation-with-past"
-        else:
-            onnx_config = onnx_config_class(model.config)
-
         model_path = save_directory / (onnx_file_name if quantization_config.save_onnx_model else ov_file_name)
         onnx_path = save_directory / onnx_file_name
         export_fn = export if not quantization_config.save_onnx_model else export_pytorch_via_onnx
         opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET)
         opset = max(opset, MIN_ONNX_QDQ_OPSET)
         kwargs = {}
         if not quantization_config.save_onnx_model:
-            kwargs = {"stateful": ensure_export_task_support_stateful(task)}
+            kwargs = {"stateful": stateful}
         _, _, is_onnx = export_fn(model=model, config=onnx_config, output=model_path, opset=opset, **kwargs)
         if is_onnx:
             # Load and save the compressed model
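
Taken together, the reworked weights-only path patches the stateful model with BetterTransformer, builds dummy inputs on the model's device, validates them, and hands them to NNCF as a one-sample dataset for tracing. A condensed, self-contained sketch of that flow, assuming an onnx_config that provides generate_dummy_inputs and an nncf release whose compress_weights accepts a dataset argument:

import nncf
import torch
from torch.utils._pytree import tree_map

from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
from optimum.exporters.openvino.model_patcher import patch_model_with_bettertransformer


def compress_weights_for_stateful(model, onnx_config, stateful: bool):
    # Condensed sketch of the weights_only branch above; error handling omitted.
    if stateful:
        # Apply BetterTransformer first so the compressed weights match the
        # graph that will actually be exported.
        model = patch_model_with_bettertransformer(model)

    dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt")
    # Move every tensor in the (possibly nested) dummy-input structure onto
    # the model's device before tracing.
    device = model.device
    dummy_inputs = tree_map(lambda v: v.to(device) if isinstance(v, torch.Tensor) else v, dummy_inputs)
    check_dummy_inputs_are_allowed(model, dummy_inputs)

    # NNCF traces the model on the single-sample dataset to build its graph.
    nncf.compress_weights(model, dataset=nncf.Dataset([dummy_inputs]))
    return model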

tests/openvino/test_quantization.py (+6 −3)

@@ -152,9 +152,7 @@ class OVWeightCompressionTest(unittest.TestCase):
     )

     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),)
-    SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = (
-        (OVModelForCausalLM, "opt125m", 64, 477),
-    )
+    SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 477),)

     SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = (
         (OVModelForCausalLM, "gpt2"),

@@ -174,6 +172,11 @@ class OVWeightCompressionTest(unittest.TestCase):

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
+        import nncf
+
+        if nncf.__version__ == "2.8.0":
+            self.skipTest("https://github.com/openvinotoolkit/nncf/issues/2432")
+
         task = model_cls.export_feature

         with tempfile.TemporaryDirectory() as tmp_dir:
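
The skip pins the exact nncf release affected by the upstream issue. A minimal self-contained sketch of the same pattern, using packaging.version for the comparison (a hypothetical variant, not what the commit does):

# Hypothetical variant of the version-pinned skip; the test body is illustrative.
import unittest

from packaging import version


class WeightCompressionSkipExample(unittest.TestCase):
    def test_weight_compression(self):
        import nncf

        if version.parse(nncf.__version__) == version.parse("2.8.0"):
            self.skipTest("https://github.com/openvinotoolkit/nncf/issues/2432")
        # ... actual compression assertions would run here ...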
