Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip automodel compression weights tests for nncf==2.8.0 #535

Merged
merged 10 commits into from
Feb 8, 2024
Merged
Next Next commit
skip compression weights tests for nncf==2.8.0 and reworked logic of optimization stateful PyTorch models
alexsu52 committed Feb 5, 2024
commit a1fc5ffaf35b0616a06f7654588cf1376b7595b4
24 changes: 3 additions & 21 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
@@ -31,14 +31,7 @@
from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
from optimum.utils import is_diffusers_available

from ...intel.utils.import_utils import (
_torch_version,
_transformers_version,
is_nncf_available,
is_optimum_version,
is_torch_version,
is_transformers_version,
)
from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
from .model_patcher import patch_model_with_bettertransformer
from .stateful import ensure_stateful_is_available, patch_stateful
from .utils import (
@@ -329,19 +322,8 @@ def export_pytorch(
logger.info(f"Using framework PyTorch: {torch.__version__}")
output = Path(output)

if stateful:
if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
COLOR_RED = "\033[1;31m"
COLOR_RESET = "\033[0m"
logger.warning(
COLOR_RED
+ "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
"Consider upgrading PyTorch and Transformers, for example by running "
"`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
+ COLOR_RESET
)

is_model_stateful = hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True
if stateful and not is_model_stateful:
# Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
# both of them are applied to demonstrate the best performance.
# TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
21 changes: 16 additions & 5 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
@@ -14,16 +14,27 @@

import logging as log

from optimum.intel.utils.import_utils import is_torch_version
from optimum.intel.utils.import_utils import (
is_torch_version,
is_transformers_version,
_torch_version,
_transformers_version,
)


def patch_model_with_bettertransformer(model):
if is_torch_version("<", "2.0"):
if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
COLOR_RED = "\033[1;31m"
COLOR_RESET = "\033[0m"
log.warn(
"integration Scaled Dot Product Attention optimization supported only with torch > 2.0."
"Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
"It is recommended to upgrade PyTorch version for using stateful model or use stateful=False"
COLOR_RED
+ "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
"Consider upgrading PyTorch and Transformers, for example by running "
"`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
+ COLOR_RESET
)

# model already has required SDPA implementation
if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa":
return model
65 changes: 42 additions & 23 deletions optimum/intel/openvino/quantization.py
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@
import transformers
from accelerate.data_loader import DataLoaderStateMixin
from datasets import Dataset, load_dataset
from nncf import NNCFConfig, compress_weights
from nncf import NNCFConfig
from nncf.torch import create_compressed_model, register_default_init_args, register_module
from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk
from nncf.torch.initialization import PTInitializingDataLoader
@@ -34,11 +34,13 @@
from transformers import DataCollator, PreTrainedModel, default_data_collator
from transformers.pytorch_utils import Conv1D

from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
from optimum.exporters.tasks import TasksManager
from optimum.quantization_base import OptimumQuantizer

from ...exporters.openvino import export, export_pytorch_via_onnx
from ...exporters.openvino.stateful import ensure_export_task_support_stateful
from ...exporters.openvino.model_patcher import patch_model_with_bettertransformer
from ...exporters.openvino.stateful import ensure_export_task_support_stateful, ensure_stateful_is_available
from ..utils.constant import _TASK_ALIASES
from .configuration import OVConfig
from .modeling_base import OVBaseModel
@@ -348,9 +350,7 @@ def _quantize_ovcausallm(
self.model.model,
quantization_dataset,
model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"),
fast_bias_correction=True
if not kwargs.get("fast_bias_correction")
else kwargs.get("fast_bias_correction"),
fast_bias_correction=True if not kwargs.get("fast_bias_correction") else kwargs.get("fast_bias_correction"),
**kwargs,
)
self.model.model = quantized_model
@@ -392,13 +392,44 @@ def _quantize_torchmodel(
if file_name is None and quantization_config.save_onnx_model
else Path(ov_file_name).with_suffix(".onnx")
)

task = self.task
model = self.model
self.model.config.save_pretrained(save_directory)
if task.startswith("text-generation"):
onnx_config = onnx_config_class(
model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache
)
if model.config.use_cache:
task = "text-generation-with-past"
else:
onnx_config = onnx_config_class(model.config)

stateful = ensure_stateful_is_available() and ensure_export_task_support_stateful(task)

if weights_only:
if getattr(self.model.config, "tie_word_embeddings", True):
# to fix problem with shared embedding weights in nncf compress_weights()
self.model.tie_weights()
compressed_model = compress_weights(self.model)
self.model = compressed_model
from torch.utils._pytree import tree_map

if stateful:
# patch model before weight compression
model = patch_model_with_bettertransformer(model)

dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt")
device = self.model.device
dummy_inputs = tree_map(
lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs
)
check_dummy_inputs_are_allowed(model, dummy_inputs)

nncf.compress_weights(self.model, dataset=nncf.Dataset([dummy_inputs]))
else:
if stateful:
logger.warn(
"Quantization algorithm does not support optimized stateful models. "
"The original model without optimization will be quantized and export."
)
stateful = False

calibration_dataloader = self._get_calibration_dataloader(
calibration_dataset=calibration_dataset,
batch_size=batch_size,
@@ -415,26 +446,14 @@ def _quantize_torchmodel(
)
compressed_model = controller.strip(do_copy=False)

task = self.task
model = self.model
self.model.config.save_pretrained(save_directory)
if task.startswith("text-generation"):
onnx_config = onnx_config_class(
model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache
)
if model.config.use_cache:
task = "text-generation-with-past"
else:
onnx_config = onnx_config_class(model.config)

model_path = save_directory / (onnx_file_name if quantization_config.save_onnx_model else ov_file_name)
onnx_path = save_directory / onnx_file_name
export_fn = export if not quantization_config.save_onnx_model else export_pytorch_via_onnx
opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET)
opset = max(opset, MIN_ONNX_QDQ_OPSET)
kwargs = {}
if not quantization_config.save_onnx_model:
kwargs = {"stateful": ensure_export_task_support_stateful(task)}
kwargs = {"stateful": stateful}
_, _, is_onnx = export_fn(model=model, config=onnx_config, output=model_path, opset=opset, **kwargs)
if is_onnx:
# Load and save the compressed model
9 changes: 6 additions & 3 deletions tests/openvino/test_quantization.py
Original file line number Diff line number Diff line change
@@ -152,9 +152,7 @@ class OVWeightCompressionTest(unittest.TestCase):
)

SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),)
SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = (
(OVModelForCausalLM, "opt125m", 64, 477),
)
SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 477),)

SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = (
(OVModelForCausalLM, "gpt2"),
@@ -174,6 +172,11 @@ class OVWeightCompressionTest(unittest.TestCase):

@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
import nncf

if nncf.__version__ == "2.8.0":
self.skipTest("https://github.com/openvinotoolkit/nncf/issues/2432")

task = model_cls.export_feature

with tempfile.TemporaryDirectory() as tmp_dir: