
Commit bf4080a

Committed Mar 6, 2024
apply review comments
1 parent 4f7e87c commit bf4080a

File tree: 5 files changed (+85 -79 lines)

 

optimum/intel/openvino/configuration.py  (+13 -15)
@@ -167,7 +167,7 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
 
         bits (`int`, defaults to 8):
             The number of bits to quantize to.
-        sym (`bool`, *optional*, defaults to `False`):
+        sym (`bool`, defaults to `False`):
             Whether to use symetric quantization.
         tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
             The tokenizer used to process the dataset. You can pass either:
@@ -177,26 +177,24 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
                 user or organization name, like `dbmdz/bert-base-german-cased`.
             - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                 using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
-        dataset (`Union[List[str]]`, *optional*):
+        dataset (`str or List[str]`, *optional*):
            The dataset used for data-aware compression. You can provide your own dataset in a list of string or just use the
            the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs or
-           ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models
-        group_size (`int`, *optional*, defaults to 128):
-            The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
-        ratio (`float`, *optional*, defaults to 1.0):
+           ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models.
+        ratio (`float`, defaults to 1.0):
            The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
            and the rest to INT8_ASYM).
+        group_size (`int`, *optional*):
+            The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         all_layers (`bool`, *optional*):
            Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion.
-        sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
+        sensitivity_metric (`str`, *optional*):
            The sensitivity metric for assigning quantization precision to layers. In order to
            preserve the accuracy of the model, the more sensitive layers receives a higher precision.
-        awq (`bool`, *optional*):
-            Enables AWQ method to unify weight ranges and improve overall model accuracy.
-        ignored_scope (`nncf.IgnoredScope`, *optional*):
+        ignored_scope (`dict`, *optional*):
            An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization.
-        subset_size (`int`, *optional*, defaults to 128):
-            Number of data samples to calculate activation statistics.
+        num_samples (`int`, *optional*):
+            The maximum number of samples composing the calibration dataset.
 
     """
@@ -205,13 +203,13 @@ def __init__(
         bits: int = 8,
         sym: bool = False,
         tokenizer: Optional[Any] = None,
-        dataset: Optional[str] = None,
+        dataset: Optional[Union[str, List[str]]] = None,
         ratio: float = 1.0,
         group_size: Optional[int] = None,
         all_layers: Optional[bool] = None,
         sensitivity_metric: Optional[str] = None,
         ignored_scope: Optional[dict] = None,
-        subset_size: int = 128,
+        num_samples: Optional[int] = None,
         **kwargs,
     ):
         self.bits = bits
@@ -223,7 +221,7 @@ def __init__(
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
         self.ignored_scope = ignored_scope
-        self.subset_size = subset_size
+        self.num_samples = num_samples
         self.quant_method = "default"  # TODO : enable AWQ after nncf v2.9.0 release
         self.post_init()

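The renamed argument is used the same way as before; an illustrative sketch, assuming the public `optimum.intel` import path (the values below are examples, not taken from this commit):

    from optimum.intel import OVWeightQuantizationConfig  # assumed import path

    # Hypothetical 4-bit, data-aware compression setup.
    config = OVWeightQuantizationConfig(
        bits=4,
        sym=False,
        dataset="wikitext2",  # a preset name or a list of raw text samples
        ratio=0.8,
        group_size=128,
        num_samples=128,      # replaces the former `subset_size` argument
    )
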
optimum/intel/openvino/modeling_decoder.py  (+2 -1)
@@ -635,7 +635,8 @@ def _from_pretrained(
                 # from optimum.gptq.utils import get_seqlen
 
                 # seqlen = get_seqlen(causal_model)
-                dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32)
+                nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
+                dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
                 dataset = prepare_dataset(dataset)
                 quantization_config = copy.deepcopy(quantization_config)
                 quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

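Caller-side, the effect of this change is that `num_samples` now caps the GPTQ-style calibration set built for decoder models (128 samples when unset). An illustrative sketch with assumed `optimum.intel` entry points and a hypothetical model id:

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig  # assumed import path

    config = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", num_samples=64)
    model = OVModelForCausalLM.from_pretrained(
        "gpt2",  # hypothetical model id
        export=True,
        quantization_config=config,
    )
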
optimum/intel/openvino/modeling_diffusion.py  (+27 -30)
@@ -17,6 +17,7 @@
 import math
 import os
 import shutil
+from copy import deepcopy
 from pathlib import Path
 from tempfile import TemporaryDirectory, gettempdir
 from typing import Any, Dict, List, Optional, Union
@@ -35,7 +36,6 @@
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
 from huggingface_hub import snapshot_download
-from nncf import Dataset
 from openvino._offline_transformations import compress_model_transformation
 from openvino.runtime import Core
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
@@ -276,17 +276,15 @@ def _from_pretrained(
                 kwargs[name] = load_method(new_model_save_dir)
 
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
-
-        dataset = None
-        if quantization_config:
-            dataset = quantization_config.dataset
-            quantization_config.dataset = None  # apply weight compression without dataset
-
+        weight_quantization_config = deepcopy(quantization_config)
         unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name
-        if quantization_config and dataset is None:
-            unet = cls.load_model(unet_path, quantization_config)
-        else:
+        if weight_quantization_config is not None and weight_quantization_config.dataset is not None:
+            # load the UNet model uncompressed to apply hybrid quantization further
             unet = cls.load_model(unet_path)
+            # Apply weights compression to other `components` without dataset
+            weight_quantization_config.dataset = None
+        else:
+            unet = cls.load_model(unet_path, quantization_config)
 
         components = {
             "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
@@ -296,12 +294,12 @@ def _from_pretrained(
         }
 
         for key, value in components.items():
-            components[key] = cls.load_model(value, quantization_config) if value.is_file() else None
+            components[key] = cls.load_model(value, weight_quantization_config) if value.is_file() else None
 
         if model_save_dir is None:
             model_save_dir = new_model_save_dir
 
-        if quantization_config and dataset is not None:
+        if quantization_config and quantization_config.dataset is not None:
             sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
 
             supported_pipelines = (
@@ -313,23 +311,23 @@ def _from_pretrained(
                 raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")
 
             num_inference_steps = 4 if isinstance(sd_model, OVLatentConsistencyModelPipeline) else 50
-            quantization_config.dataset = dataset
+            nsamples = quantization_config.num_samples if quantization_config.num_samples else 200
+            dataset = deepcopy(quantization_config.dataset)
 
-            if isinstance(quantization_config.dataset, str):
+            if isinstance(dataset, str):
                 from .quantization import get_stable_diffusion_dataset
 
-                dataset_name = quantization_config.dataset
-                num_samples = math.ceil(quantization_config.subset_size / num_inference_steps)
-                quantization_config.dataset = get_stable_diffusion_dataset(dataset_name, num_samples)
+                num_unet_runs = math.ceil(nsamples / num_inference_steps)
+                dataset = get_stable_diffusion_dataset(dataset, num_unet_runs)
 
-            unet_inputs = sd_model.prepare_inputs(
-                quantization_config.dataset, quantization_config.subset_size, num_inference_steps
-            )
-            quantization_config.dataset = unet_inputs
+            unet_inputs = sd_model._prepare_unet_inputs(dataset, nsamples, num_inference_steps)
 
             from .quantization import _hybrid_quantization
 
-            unet = _hybrid_quantization(sd_model.unet.model, quantization_config)
+            hybrid_quantization_config = deepcopy(quantization_config)
+            hybrid_quantization_config.dataset = unet_inputs
+            hybrid_quantization_config.num_samples = nsamples
+            unet = _hybrid_quantization(sd_model.unet.model, hybrid_quantization_config)
 
             return cls(
                 unet=unet,
@@ -340,27 +338,26 @@ def _from_pretrained(
             **kwargs,
         )
 
-    def prepare_inputs(
+    def _prepare_unet_inputs(
         self,
-        dataset: Dataset,
-        subset_size: int,
+        dataset: List[str],
+        num_samples: int,
         num_inference_steps: int,
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         **kwargs,
-    ) -> Dataset:
+    ) -> Dict[str, Any]:
         self.compile()
         calibration_data = []
 
         from .quantization import InferRequestWrapper
 
         self.unet.request = InferRequestWrapper(self.unet.request, calibration_data)
-        for prompt in dataset.get_inference_data():
+        for prompt in dataset:
             _ = self.__call__(prompt, num_inference_steps=num_inference_steps, height=height, width=width)
-            if len(calibration_data) >= subset_size:
-                break
+
         self.unet.request = self.unet.request.request
-        return Dataset(calibration_data)
+        return calibration_data[:num_samples]
 
     @classmethod
     def _from_transformers(
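Driving the reworked flow from user code: when the config carries a dataset, `_from_pretrained` loads the UNet uncompressed, collects up to `num_samples` UNet inputs through `_prepare_unet_inputs`, and passes them to `_hybrid_quantization`, while the remaining components get weight-only compression. An illustrative sketch with an assumed pipeline class and a hypothetical model id:

    from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig  # assumed import path

    config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=200)
    pipe = OVStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # hypothetical model id
        export=True,
        quantization_config=config,
    )
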
optimum/intel/openvino/quantization.py  (+37 -27)
@@ -19,7 +19,7 @@
 from collections import deque
 from copy import deepcopy
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import nncf
 import openvino
@@ -548,7 +548,7 @@ def _remove_unused_columns(self, dataset: "Dataset"):
 
 def _weight_only_quantization(
     model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]
-):
+) -> openvino.runtime.Model:
     config = quantization_config
     if isinstance(config, dict):
         config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -562,7 +562,8 @@ def _weight_only_quantization(
 
             from optimum.gptq.data import get_dataset, prepare_dataset
 
-            dataset = get_dataset(config.dataset, tokenizer, seqlen=32)
+            nsamples = config.num_samples if config.num_samples else 128
+            dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
             dataset = prepare_dataset(dataset)
 
     sensitivity_metric = None
@@ -588,7 +589,7 @@ def _weight_only_quantization(
         # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0
         ignored_scope=ignored_scope,
         dataset=dataset,
-        # subset_size=config.subset_size, # TODO : enable from nncf v2.9.0
+        # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0
     )
 
 
@@ -639,7 +640,7 @@ def _collect_ops_with_weights(model):
 
 def get_stable_diffusion_dataset(
     dataset_name: str, nsamples: int = 50, seed: int = 0, text_column: str = "caption"
-) -> nncf.Dataset:
+) -> List[str]:
     if dataset_name not in ["conceptual_captions", "laion/220k-GPT4Vision-captions-from-LIVIS", "laion/filtered-wit"]:
         raise ValueError(
             f"""You have entered a string value for dataset. You can only choose between
@@ -649,37 +650,46 @@ def get_stable_diffusion_dataset(
 
     data = load_dataset(dataset_name, split="train", streaming=True).shuffle(seed=seed).take(nsamples)
     dataset = [batch[text_column] for batch in data]
-    return nncf.Dataset(dataset)
+    return dataset
 
 
-def _hybrid_quantization(model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]):
-    dataset = quantization_config.dataset
-    wc_ignored_scope = deepcopy(quantization_config.ignored_scope)
-
-    if isinstance(wc_ignored_scope, dict):
-        wc_ignored_scope["types"] = wc_ignored_scope.get("types", []) + ["Convolution"]
-    else:
-        assert wc_ignored_scope is None
-        wc_ignored_scope = {"types": ["Convolution"]}
+def _hybrid_quantization(
+    model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig
+) -> openvino.runtime.Model:
+    """
+    Quantize a model in hybrid mode with NNCF which means that we quantize:
+    weights of MatMul and Embedding layers and activations of other layers.
+    The optimization specifications defined in `quantization_config`.
+
+    Args:
+        model (`openvino.runtime.Model`):
+            The OpenVINO Runtime model for applying hybrid quantization.
+        quantization_config (`OVWeightQuantizationConfig`):
+            The configuration containing the parameters related to quantization.
+    Returns:
+        The OpenVINO Runtime model with applied hybrid quantization.
+    """
+    ignored_scope = quantization_config.ignored_scope if quantization_config.ignored_scope is not None else {}
 
     ops_to_compress = _collect_ops_with_weights(model)
-    ptq_ignored_scope = deepcopy(quantization_config.ignored_scope)
-    if isinstance(ptq_ignored_scope, dict):
-        ptq_ignored_scope["names"] = ptq_ignored_scope.get("names", []) + ops_to_compress
-    else:
-        assert ptq_ignored_scope is None
-        ptq_ignored_scope = {"names": ops_to_compress}
+    ptq_ignored_scope = deepcopy(ignored_scope)
+    ptq_ignored_scope["names"] = ignored_scope.get("names", []) + ops_to_compress
 
-    quantization_config.dataset = None  # Apply Weight Compression without dataset
-    quantization_config.ignored_scope = wc_ignored_scope
-    compressed_model = _weight_only_quantization(model, quantization_config)
+    wc_quantization_config = deepcopy(quantization_config)
+    wc_quantization_config.ignored_scope = ignored_scope
+    wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"]
+    # Apply Weight Compression without dataset
+    wc_quantization_config.dataset = None
+    compressed_model = _weight_only_quantization(model, wc_quantization_config)
 
+    subset_size = quantization_config.num_samples if quantization_config.num_samples else 200
     quantized_model = nncf.quantize(
-        compressed_model,
-        dataset,
+        model=compressed_model,
+        calibration_dataset=nncf.Dataset(quantization_config.dataset),
         model_type=nncf.ModelType.TRANSFORMER,
         ignored_scope=nncf.IgnoredScope(**ptq_ignored_scope),
+        # The SQ algo should be disabled for MatMul nodes because their weights are already compressed
        advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)),
-        subset_size=quantization_config.subset_size,
+        subset_size=subset_size,
     )
     return quantized_model

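To make the ignored-scope bookkeeping in `_hybrid_quantization` easier to follow, here is a small standalone sketch on a toy user-supplied scope (op names and types are hypothetical; in the real code `ops_to_compress` comes from `_collect_ops_with_weights(model)`):

    from copy import deepcopy

    ignored_scope = {"names": ["up_blocks.0.attentions.0"], "types": ["GroupNormalization"]}  # hypothetical user scope
    ops_to_compress = ["MatMul_12", "MatMul_57"]  # hypothetical ops with weights

    # nncf.quantize() must skip the ops whose weights were already compressed, plus any user-listed names.
    ptq_ignored_scope = deepcopy(ignored_scope)
    ptq_ignored_scope["names"] = ignored_scope.get("names", []) + ops_to_compress

    # Weight compression additionally skips Convolution ops; they are handled by the later nncf.quantize() pass.
    wc_ignored_scope = deepcopy(ignored_scope)
    wc_ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"]

    print(ptq_ignored_scope)  # {'names': ['up_blocks.0.attentions.0', 'MatMul_12', 'MatMul_57'], 'types': ['GroupNormalization']}
    print(wc_ignored_scope)   # {'names': ['up_blocks.0.attentions.0'], 'types': ['GroupNormalization', 'Convolution']}
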
tests/openvino/test_quantization.py  (+6 -6)
@@ -362,7 +362,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
     def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):
         model_id = MODEL_NAMES[model_type]
-        quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=5)
+        quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2)
         with tempfile.TemporaryDirectory() as tmp_dir:
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
 
@@ -373,18 +373,18 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f
 
             model.save_pretrained(tmp_dir)
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[2:])
     def test_ovmodel_hybrid_quantization_with_custom_dataset(
         self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8
     ):
         model_id = MODEL_NAMES[model_type]
-        dataset_name = "daspartho/stable-diffusion-prompts"
-        dataset = load_dataset(dataset_name, split="train", streaming=True)
-        quantization_dataset = nncf.Dataset(dataset, lambda x: x["prompt"])
+        dataset = [
+            "dream rose covered with clean crystal, sharp edges, transparent, beautiful, highly detailed, high render"
+        ]
         model = model_cls.from_pretrained(
             model_id,
             export=True,
-            quantization_config=OVWeightQuantizationConfig(bits=8, dataset=quantization_dataset, subset_size=3),
+            quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3),
         )
         num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
         self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
0 commit comments