
Commit f3b8ce8

Merge branch 'main' into openvino_tokenizers
2 parents 09b067f + 1c14957 · commit f3b8ce8

30 files changed: +1006 −489 lines changed

README.md

+2 −2

@@ -78,10 +78,10 @@ It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2
 optimum-cli export openvino --model gpt2 ov_model
 ```

-If you add `--int8`, the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model: the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.

 ```plain
-optimum-cli export openvino --model gpt2 --int8 ov_model
+optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```

 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
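For context, a minimal sketch of how a model exported with the new `--weight-format int8` flag could be loaded back for inference. The `ov_model` directory and `gpt2` checkpoint follow the README example; the prompt and generation settings are illustrative.

```python
# Hedged sketch: load the INT8 export produced by
# `optimum-cli export openvino --model gpt2 --weight-format int8 ov_model`.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model = OVModelForCausalLM.from_pretrained("ov_model")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)  # illustrative settings
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```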

optimum/exporters/openvino/convert.py

+2 −20

@@ -33,14 +33,7 @@
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.utils import is_diffusers_available

-from ...intel.utils.import_utils import (
-    _torch_version,
-    _transformers_version,
-    is_nncf_available,
-    is_optimum_version,
-    is_torch_version,
-    is_transformers_version,
-)
+from ...intel.utils.import_utils import is_nncf_available, is_optimum_version
 from .model_patcher import patch_model_with_bettertransformer
 from .stateful import ensure_stateful_is_available, patch_stateful
 from .utils import (
@@ -97,6 +90,7 @@ def _save_model(model, path: str, compression_option: Optional[str] = None, comp
                 "ratio": compression_ratio,
             },
         }
+
         model = nncf.compress_weights(model, **COMPRESSION_OPTIONS[compression_option])

     compress_to_fp16 = compression_option == "fp16"
@@ -332,18 +326,6 @@ def export_pytorch(
     output = Path(output)

     if stateful:
-        if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
-            COLOR_RED = "\033[1;31m"
-            COLOR_RESET = "\033[0m"
-            logger.warning(
-                COLOR_RED
-                + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
-                f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
-                "Consider upgrading PyTorch and Transformers, for example by running "
-                "`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
-                + COLOR_RESET
-            )
-
         # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
         # both of them are applied to demonstrate the best performance.
         # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
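The `_save_model` hunk above routes weight compression through `nncf.compress_weights`. As a standalone sketch, assuming `nncf` and `openvino` are installed, the same call looks like the following; the options dict here is illustrative, not the `COMPRESSION_OPTIONS` table defined in `convert.py`.

```python
# Illustrative standalone weight compression with NNCF; the mode mapping
# below is an assumption, not the COMPRESSION_OPTIONS table from convert.py.
import nncf
import openvino as ov

options = {"int8": {"mode": nncf.CompressWeightsMode.INT8}}

core = ov.Core()
model = core.read_model("model.xml")  # hypothetical exported IR
model = nncf.compress_weights(model, **options["int8"])
ov.save_model(model, "model_int8.xml", compress_to_fp16=False)
```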

optimum/exporters/openvino/model_patcher.py

+20 −5

@@ -14,16 +14,31 @@

 import logging as log

-from optimum.intel.utils.import_utils import is_torch_version
+from optimum.intel.utils.import_utils import (
+    _torch_version,
+    _transformers_version,
+    is_torch_version,
+    is_transformers_version,
+)


 def patch_model_with_bettertransformer(model):
-    if is_torch_version("<", "2.0"):
+    # check that the model has not yet been patched
+    if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
+        return model
+
+    if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
+        COLOR_RED = "\033[1;31m"
+        COLOR_RESET = "\033[0m"
         log.warn(
-            "integration Scaled Dot Product Attention optimization supported only with torch > 2.0."
-            "Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
-            "It is recommended to upgrade PyTorch version for using stateful model or use stateful=False"
+            COLOR_RED
+            + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
+            f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
+            "Consider upgrading PyTorch and Transformers, for example by running "
+            "`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again"
+            + COLOR_RESET
         )
+
     # model already has required SDPA implementation
     if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa":
         return model
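For reference, a hedged sketch of the conversion that `patch_model_with_bettertransformer` guards: `to_bettertransformer()` is the public transformers API (it requires optimum installed), and the attribute check mirrors the new idempotency guard above. The fallback handling is illustrative.

```python
# Hedged sketch of applying BetterTransformer once, mirroring the new guard.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

if not getattr(model, "use_bettertransformer", False):
    try:
        model = model.to_bettertransformer()
    except Exception as exc:  # unsupported architectures raise here
        print(f"BetterTransformer not applied, keeping eager attention: {exc}")
```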

optimum/exporters/openvino/stateful.py

+1 −4

@@ -22,7 +22,6 @@
 from openvino.runtime import opset13
 from optimum.exporters import TasksManager
 from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version
-from optimum.utils.normalized_config import NormalizedConfigManager


 def model_has_state(ov_model: ov.Model):
@@ -217,9 +216,7 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model):
     batch_dim = 1 if config.model_type == "chatglm" else 0

     fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
-
-    normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
-    num_attention_heads = normalized_config.num_attention_heads if config.model_type == "bloom" else 1
+    num_attention_heads = config.num_attention_heads if config.model_type == "bloom" else 1
     make_stateful(
         ov_model, not_kv_inputs, key_value_input_names, key_value_output_names, batch_dim, num_attention_heads, None
     )
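The replacement works because `config.num_attention_heads` resolves directly on the Bloom config (transformers aliases it to `n_head` via the config's `attribute_map`). A defensive variant, shown purely as a hypothetical sketch and not part of this commit, could fall back across common aliases:

```python
# Hypothetical helper, not part of this commit: resolve the head count
# across the attribute names different configs use.
def get_num_attention_heads(config) -> int:
    for name in ("num_attention_heads", "n_head", "n_heads"):
        if hasattr(config, name):
            return getattr(config, name)
    raise AttributeError("config exposes no attention-head attribute")
```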

optimum/intel/__init__.py

+18 −4

@@ -48,9 +48,11 @@
     "IPEXModelForMaskedLM",
     "IPEXModelForTokenClassification",
     "IPEXModelForQuestionAnswering",
+    "IPEXModelForImageClassification",
+    "IPEXModelForAudioClassification",
+    "IPEXModel",
 ]

-
 try:
     if not (is_openvino_available() and is_nncf_available()):
         raise OptionalDependencyNotAvailable()
@@ -60,9 +62,12 @@
         "OVQuantizer",
         "OVTrainer",
         "OVTrainingArguments",
+        "OVWeightQuantizationConfig",
     ]
 else:
-    _import_structure["openvino"].extend(["OVConfig", "OVQuantizer", "OVTrainer", "OVTrainingArguments"])
+    _import_structure["openvino"].extend(
+        ["OVConfig", "OVQuantizer", "OVTrainer", "OVTrainingArguments", "OVWeightQuantizationConfig"]
+    )

 try:
     if not (is_openvino_available() and is_diffusers_available()):
@@ -159,7 +164,10 @@
     from .utils.dummy_ipex_objects import *
 else:
     from .ipex import (
+        IPEXModel,
+        IPEXModelForAudioClassification,
         IPEXModelForCausalLM,
+        IPEXModelForImageClassification,
         IPEXModelForMaskedLM,
         IPEXModelForQuestionAnswering,
         IPEXModelForSequenceClassification,
@@ -171,9 +179,15 @@
     if not (is_openvino_available() and is_nncf_available()):
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
-    from .utils.dummy_openvino_and_nncf_objects import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments
+    from .utils.dummy_openvino_and_nncf_objects import (
+        OVConfig,
+        OVQuantizer,
+        OVTrainer,
+        OVTrainingArguments,
+        OVWeightQuantizationConfig,
+    )
 else:
-    from .openvino import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments
+    from .openvino import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments, OVWeightQuantizationConfig

 try:
     if not (is_openvino_available() and is_diffusers_available()):
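The edits above extend the `_import_structure` dict that drives lazy imports in this `__init__.py`. A condensed sketch of the pattern, assuming the `_LazyModule` machinery from `transformers.utils` and a structure trimmed for illustration:

```python
# Condensed sketch of the lazy-import pattern this __init__.py follows;
# belongs in a package __init__.py, names trimmed for illustration.
import importlib.util
import sys

from transformers.utils import _LazyModule

_import_structure = {"utils": []}

if importlib.util.find_spec("nncf") is not None:
    _import_structure["openvino"] = ["OVQuantizer", "OVWeightQuantizationConfig"]
else:
    _import_structure["utils.dummy_openvino_and_nncf_objects"] = ["OVQuantizer"]

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
```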

optimum/intel/generation/modeling.py

+1 −3

@@ -66,13 +66,11 @@ def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = Fals

 def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False):
     model_inputs = prepare_jit_inputs(model, task, use_cache)
-    model.config.return_dict = False
+    model.config.return_dict = task not in {"text-generation", "audio-classification"}
     # check if the model_inputs is correct.
     model(**model_inputs)

     torch._C._jit_set_texpr_fuser_enabled(False)
-    if "past_key_values" in model_inputs.keys():
-        model.config.return_dict = False
     if is_torch_version(">=", "2.1.0"):
         traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False)
     else:
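The torch>=2.1 branch traces with `example_kwarg_inputs`, which binds keyword arguments by name instead of positionally. A minimal hedged sketch of that path; the model choice and the freezing step are illustrative, not taken from `jit_trace`.

```python
# Hedged sketch of kwarg-based tracing on the torch>=2.1 path.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
model.config.return_dict = False  # traced graphs return tuples, not ModelOutput
model.eval()

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
example = dict(tokenizer("hello world", return_tensors="pt"))

with torch.no_grad():
    traced = torch.jit.trace(model, example_kwarg_inputs=example, strict=False)
    traced = torch.jit.freeze(traced)  # illustrative follow-up optimization
```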

optimum/intel/ipex/__init__.py

+3 −0

@@ -1,5 +1,8 @@
 from optimum.intel.ipex.modeling_base import (
+    IPEXModel,
+    IPEXModelForAudioClassification,
     IPEXModelForCausalLM,
+    IPEXModelForImageClassification,
     IPEXModelForMaskedLM,
     IPEXModelForQuestionAnswering,
     IPEXModelForSequenceClassification,
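A hedged usage sketch for one of the newly exported classes. The checkpoint id is illustrative, and `export=True` is assumed to trigger conversion here the way it does for the other `IPEXModelFor*` classes.

```python
# Hedged sketch: loading one of the newly exported IPEX classes.
from optimum.intel.ipex import IPEXModelForImageClassification

# export=True converts the PyTorch checkpoint on the fly (assumed behavior,
# matching the other IPEXModelFor* classes).
model = IPEXModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224", export=True
)
```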

optimum/intel/ipex/inference.py

+1 −19

@@ -31,25 +31,13 @@
     IPEXModelForMaskedLM,
     IPEXModelForSequenceClassification,
     IPEXModelForTokenClassification,
-    IPEXBloomForCausalLM,
-    IPEXMPTForCausalLM,
-    IPEXOPTForCausalLM,
-    IPEXGPTBigCodeForCausalLM,
     IPEXModelForQuestionAnswering,
 )


 from .utils import _HEAD_TO_AUTOMODELS


-_MODEL_TYPE_TO_AUTOMODELS = {
-    "bloom": IPEXBloomForCausalLM,
-    "mpt": IPEXMPTForCausalLM,
-    "opt": IPEXOPTForCausalLM,
-    "big_code": IPEXGPTBigCodeForCausalLM,
-}
-
-
 logger = logging.getLogger(__name__)

 IPEX_NOT_AVAILABLE_ERROR_MSG = (
@@ -146,13 +134,7 @@ def __enter__(self):
             )
             if task in _HEAD_TO_AUTOMODELS:
                 model = jit_trace(model, task, use_cache)
-                model_type = getattr(self._original.config, "model_type", "").replace("_", "-")
-
-                if task == "text-generation" and model_type in _MODEL_TYPE_TO_AUTOMODELS.keys():
-                    auto_model_class = _MODEL_TYPE_TO_AUTOMODELS[task]
-                else:
-                    auto_model_class = eval(_HEAD_TO_AUTOMODELS[task])
-
+                auto_model_class = eval(_HEAD_TO_AUTOMODELS[task])
                 model = auto_model_class(model, self._original.config, use_cache=use_cache)

     # Enable automatic mixed precision (AMP) if we are going to target `bfloat16`
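After this change every task resolves through `eval(_HEAD_TO_AUTOMODELS[task])`, where `_HEAD_TO_AUTOMODELS` maps task names to class-name strings. An eval-free equivalent, shown only as an illustrative alternative, dispatches through an explicit dict:

```python
# Illustrative eval-free dispatch; the mapping mirrors the idea of
# _HEAD_TO_AUTOMODELS but is not the table defined in .utils.
from optimum.intel.ipex import (
    IPEXModelForCausalLM,
    IPEXModelForSequenceClassification,
)

_TASK_TO_CLASS = {
    "text-generation": IPEXModelForCausalLM,
    "text-classification": IPEXModelForSequenceClassification,
}

def resolve_automodel(task: str):
    try:
        return _TASK_TO_CLASS[task]
    except KeyError as exc:
        raise ValueError(f"Unsupported task: {task}") from exc
```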
