Commit 2b79a0d

Merge branch 'main' into nncf-210-update
2 parents f99f767 + 673b88b commit 2b79a0d

31 files changed: +843 −433 lines

.github/workflows/test_inc.yml

+10-4
@@ -32,11 +32,17 @@ jobs:
           python -m pip install --upgrade pip
           pip install cmake
           pip install py-cpuinfo
-          pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
           pip install .[neural-compressor,diffusers,tests]
-          pip install intel-extension-for-pytorch==2.1.100
-          pip install intel-extension-for-transformers==1.3.2
+          pip install intel-extension-for-transformers
           pip install peft
+
       - name: Test with Pytest
         run: |
-          pytest tests/neural_compressor/
+          pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
+      - name: Test IPEX
+        run: |
+          pip uninstall -y intel-extension-for-transformers
+          pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install intel-extension-for-pytorch==2.1.100
+          pytest tests/neural_compressor/test_ipex.py
+

.github/workflows/test_openvino.yml

+5-1
@@ -35,7 +35,11 @@ jobs:
           pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
       - name: Test with Pytest
         run: |
-          pytest tests/openvino/ --ignore test_modeling_basic --durations=0
+          pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+      - name: Test basic
+        run: |
+          pip uninstall -y nncf
+          pytest tests/openvino/test_modeling_basic.py
       - name: Test openvino-nightly
         run: |
           pip uninstall -y openvino

README.md

+10-3
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```
 
-You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model : the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision.
 
 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```
 
+Quantization in hybrid mode can be applied to Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization for the rest of the pipeline components. In the hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
 
 #### Inference:
@@ -122,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher
 
 ```python
 from functools import partial
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
@@ -145,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset(
 # The directory where the quantized model will be saved
 save_dir = "nncf_results"
 # Apply static quantization and save the resulting model in the OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Load the quantized model
 optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 ```
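For reference, a minimal sketch (not part of this commit) of loading the hybrid-quantized pipeline produced by the CLI example above; the `ov_model` directory name and the prompt are illustrative.

```python
from optimum.intel import OVStableDiffusionPipeline

# Load the Stable Diffusion pipeline exported with hybrid quantization (directory from the CLI example above)
pipeline = OVStableDiffusionPipeline.from_pretrained("ov_model")

# Run inference on the quantized OpenVINO pipeline
image = pipeline("sailing ship in storm by Leonardo da Vinci").images[0]
image.save("ship.png")
```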

docs/source/optimization_ov.mdx

+3-2
@@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o
 
 ```python
 from transformers import AutoTokenizer
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification,
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
 model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
@@ -95,7 +95,8 @@ save_dir = "ptq_model"
 quantizer = OVQuantizer.from_pretrained(model)
 
 # Apply static quantization and export the resulting quantized model to OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Save the tokenizer
 tokenizer.save_pretrained(save_dir)
 ```
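For context, a minimal sketch (not part of this commit) of loading the statically quantized model saved in `ptq_model` back for inference; the input sentence is illustrative.

```python
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForSequenceClassification

# Load the quantized OpenVINO model and tokenizer saved above
model = OVModelForSequenceClassification.from_pretrained("ptq_model")
tokenizer = AutoTokenizer.from_pretrained("ptq_model")

# Run the quantized model through a standard transformers pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("This soundtrack is absolutely wonderful."))
```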

examples/neural_compressor/language-modeling/run_clm.py

+21-20
@@ -64,8 +64,7 @@
 
 
 if is_intel_extension_for_transformers_available():
-    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
-
+    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
@@ -227,8 +226,9 @@ class OptimizationArguments:
         metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
     )
     quantization_methodology: str = field(
-        default="RTN",
-        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
+        choices=["rtn", "gptq"],
+        default="rtn",
+        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
     )
     damp_percent: float = field(
         default=0.01,
@@ -662,22 +662,23 @@ def compute_metrics(eval_preds):
             raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
         if optim_args.apply_pruning or optim_args.apply_distillation:
             raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
-        if optim_args.quantization_methodology == "GPTQ":
-            algorithm_args = {
-                "act_order": False,
-                "percdamp": optim_args.damp_percent,
-                "block_size": optim_args.gptq_block_size,
-                "nsamples": optim_args.num_calibration_samples,
-                "use_max_length": optim_args.use_max_length,
-                "pad_max_length": optim_args.pad_max_length,
-            }
-        quantization_config = WeightOnlyQuantConfig(
-            weight_dtype=optim_args.weight_dtype,
-            group_size=optim_args.group_size,
-            scheme=optim_args.weight_only_scheme,
-            algorithm=optim_args.quantization_methodology,
-            algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
-        )
+
+        algorithm_args = {
+            "weight_dtype": optim_args.weight_dtype,
+            "sym": optim_args.weight_only_scheme == "sym",
+            "group_size": optim_args.group_size,
+        }
+
+        if optim_args.quantization_methodology == "gptq":
+            quantization_config = GPTQConfig(
+                damp_percent=optim_args.damp_percent,
+                nsamples=optim_args.num_calibration_samples,
+                blocksize=optim_args.gptq_block_size,
+                **algorithm_args,
+            )
+        else:
+            quantization_config = RtnConfig(**algorithm_args)
+
     else:
         quantization_config = PostTrainingQuantConfig(
             approach=optim_args.quantization_approach, recipes=recipes
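For context, a minimal standalone sketch (not part of this commit) of the new weight-only configuration path, with illustrative values substituted for the script's `optim_args`; the constructor arguments mirror the diff above.

```python
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig

# Shared weight-only settings (values are illustrative)
algorithm_args = {
    "weight_dtype": "int8",
    "sym": True,
    "group_size": 128,
}

methodology = "gptq"  # or "rtn"
if methodology == "gptq":
    quantization_config = GPTQConfig(
        damp_percent=0.01,
        nsamples=128,
        blocksize=128,
        **algorithm_args,
    )
else:
    quantization_config = RtnConfig(**algorithm_args)
```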

notebooks/openvino/question_answering_quantization.ipynb

+7-6
@@ -51,7 +51,7 @@
 "import transformers\n",
 "from evaluate import evaluator\n",
 "from openvino.runtime import Core\n",
-"from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer\n",
+"from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n",
 "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
 "\n",
 "transformers.logging.set_verbosity_error()\n",
@@ -286,11 +286,11 @@
 "**NOTE:** if you notice very low accuracy after post-training quantization, it is likely caused by an overflow issue which affects processors that do not contain VNNI (Vector Neural Network Instruction). NNCF has an `overflow_fix` option to address this. It will effectively use 7-bits for quantizing instead of 8-bits to prevent the overflow. To use this option, modify the code in the next cell to add an explicit quantization configuration, and set `overflow_fix` to `\"enable\"`:\n",
 "\n",
 "```\n",
-"from optimum.intel.openvino import OVConfig\n",
+"from optimum.intel.openvino import OVConfig, OVQuantizationConfig\n",
 "\n",
-"ov_config = OVConfig()\n",
-"ov_config.compression[\"overflow_fix\"] = \"enable\"\n",
-"quantizer = OVQuantizer.from_pretrained(model, ov_config=ov_config)\n",
+"ov_config = OVConfig(quantization_config=OVQuantizationConfig(overflow_fix=\"enable\"))\n",
+"quantizer = OVQuantizer.from_pretrained(model)\n",
+"quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path, ov_config=ov_config)\n",
 "```\n",
 "\n",
 "For more information, see [Lower Numerical Precision Deep Learning Inference and Training](https://www.intel.com/content/www/us/en/developer/articles/technical/lower-numerical-precision-deep-learning-inference-and-training.html)"
@@ -317,7 +317,8 @@
 "\n",
 "# Quantize the model\n",
 "quantizer = OVQuantizer.from_pretrained(model)\n",
-"quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path)"
+"ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
+"quantizer.quantize(calibration_dataset=train_dataset, ov_config=ov_config, save_directory=int8_ptq_model_path)"
 ]
 },
 {

optimum/commands/export/openvino.py

+80-16
@@ -19,6 +19,7 @@
 from typing import TYPE_CHECKING, Optional
 
 from ...exporters import TasksManager
+from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
 from ..base import BaseOptimumCLICommand, CommandInfo
 
 
@@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
     )
+    optional_group.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help=(
+            "The dataset used for data-aware compression or quantization with NNCF. "
+            "You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs "
+            "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+        ),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -115,10 +126,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
         ),
     )
+    optional_group.add_argument(
+        "--disable-convert-tokenizer",
+        action="store_true",
+        help="Do not add converted tokenizer and detokenizer OpenVINO models.",
+    )
     optional_group.add_argument(
         "--convert-tokenizer",
         action="store_true",
-        help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers",
+        help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.",
     )
 
     optional_group.add_argument(
@@ -195,20 +211,68 @@ def run(self):
             )
             quantization_config["sym"] = "asym" not in self.args.weight_format
             quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
+            quantization_config["dataset"] = self.args.dataset
             ov_config = OVConfig(quantization_config=quantization_config)
 
-        # TODO : add input shapes
-        main_export(
-            model_name_or_path=self.args.model,
-            output=self.args.output,
-            task=self.args.task,
-            framework=self.args.framework,
-            cache_dir=self.args.cache_dir,
-            trust_remote_code=self.args.trust_remote_code,
-            pad_token_id=self.args.pad_token_id,
-            ov_config=ov_config,
-            stateful=not self.args.disable_stateful,
-            convert_tokenizer=self.args.convert_tokenizer,
-            library_name=self.args.library
-            # **input_shapes,
-        )
+        library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
+        if library_name == "sentence_transformers" and self.args.library is None:
+            logger.warning(
+                "Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`."
+                "`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers"
+            )
+            library_name = "transformers"
+
+        if (
+            library_name == "diffusers"
+            and ov_config
+            and ov_config.quantization_config
+            and ov_config.quantization_config.dataset is not None
+        ):
+            if not is_diffusers_available():
+                raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))
+
+            from diffusers import DiffusionPipeline
+
+            diffusers_config = DiffusionPipeline.load_config(self.args.model)
+            class_name = diffusers_config.get("_class_name", None)
+
+            if class_name == "LatentConsistencyModelPipeline":
+                from optimum.intel import OVLatentConsistencyModelPipeline
+
+                model_cls = OVLatentConsistencyModelPipeline
+
+            elif class_name == "StableDiffusionXLPipeline":
+                from optimum.intel import OVStableDiffusionXLPipeline
+
+                model_cls = OVStableDiffusionXLPipeline
+            elif class_name == "StableDiffusionPipeline":
+                from optimum.intel import OVStableDiffusionPipeline
+
+                model_cls = OVStableDiffusionPipeline
+            else:
+                raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
+
+            model = model_cls.from_pretrained(
+                self.args.model, export=True, quantization_config=ov_config.quantization_config
+            )
+            model.save_pretrained(self.args.output)
+
+        else:
+            if self.args.convert_tokenizer:
+                logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")
+
+            # TODO : add input shapes
+            main_export(
+                model_name_or_path=self.args.model,
+                output=self.args.output,
+                task=self.args.task,
+                framework=self.args.framework,
+                cache_dir=self.args.cache_dir,
+                trust_remote_code=self.args.trust_remote_code,
+                pad_token_id=self.args.pad_token_id,
+                ov_config=ov_config,
+                stateful=not self.args.disable_stateful,
+                convert_tokenizer=not self.args.disable_convert_tokenizer,
+                library_name=library_name,
+                # **input_shapes,
+            )
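For illustration, two sketch invocations exercising the options added above (model and output directory names are placeholders, not from this commit):

```plain
# Export without converting the tokenizer (new flag; --convert-tokenizer is now deprecated)
optimum-cli export openvino --model gpt2 --disable-convert-tokenizer ov_model

# Hybrid quantization of a diffusion model, triggered by passing --dataset together with --weight-format int8
optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_sd_model
```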

optimum/exporters/openvino/__main__.py

+6-11
@@ -22,11 +22,10 @@
 from optimum.exporters import TasksManager
 from optimum.exporters.onnx.base import OnnxConfig
 from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
+from optimum.exporters.openvino.convert import export_from_model, export_tokenizer
+from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
 from optimum.utils.save_utils import maybe_load_preprocessors
 
-from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
-from .convert import export_from_model, export_tokenizer
-
 
 if TYPE_CHECKING:
     from optimum.intel.openvino.configuration import OVConfig
@@ -77,7 +76,7 @@ def main_export(
         model_name_or_path (`str`):
             Model ID on huggingface.co or path on disk to the model repository to export.
         output (`Union[str, Path]`):
-            Path indicating the directory where to store the generated ONNX model.
+            Path indicating the directory where to store the generated OpenVINO model.
 
         > Optional parameters
 
@@ -187,12 +186,6 @@ def main_export(
             f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
         )
 
-    if convert_tokenizer and not is_openvino_tokenizers_available():
-        logger.warning(
-            "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
-        )
-        convert_tokenizer = False
-
     do_gptq_patching = False
     custom_architecture = False
     loading_kwargs = {}
@@ -348,7 +341,7 @@ class StoreAttr(object):
         **kwargs_shapes,
     )
 
-    if convert_tokenizer:
+    if convert_tokenizer and is_openvino_tokenizers_available():
         if library_name != "diffusers":
             tokenizer = next(
                 (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)),
@@ -371,6 +364,8 @@ class StoreAttr(object):
             tokenizer_2 = getattr(model, "tokenizer_2", None)
             if tokenizer_2 is not None:
                 export_tokenizer(tokenizer_2, output, suffix="_2")
+    elif convert_tokenizer and not is_openvino_tokenizers_available():
+        logger.warning("Tokenizer won't be converted.")
 
     # Unpatch modules after GPTQ export
     if do_gptq_patching:
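For illustration, a minimal sketch (not part of this commit) of driving the exporter programmatically; only arguments visible in the diff above are used, and the model name, output directory, and task value are placeholders.

```python
from optimum.exporters.openvino import main_export

# Export a model to OpenVINO IR; with the change above, tokenizer conversion is
# silently skipped (with a warning) when openvino-tokenizers is not installed.
main_export(
    model_name_or_path="gpt2",
    output="ov_model",
    task="text-generation-with-past",
    convert_tokenizer=True,
)
```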
