Commit ba4f195

Add hybrid quantization for StableDiffusion pipelines
1 parent 5e319aa commit ba4f195

5 files changed: +223 -8 lines changed

optimum/intel/openvino/configuration.py

+14-3
@@ -179,7 +179,8 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
             using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
         dataset (`Union[List[str]]`, *optional*):
             The dataset used for data-aware compression. You can provide your own dataset in a list of strings or just use
-            the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new']
+            the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLMs or
+            ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models
         group_size (`int`, *optional*, defaults to 128):
             The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         ratio (`float`, *optional*, defaults to 1.0):
@@ -194,6 +195,8 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
             Enables AWQ method to unify weight ranges and improve overall model accuracy.
         ignored_scope (`nncf.IgnoredScope`, *optional*):
             An ignored scope that defines the list of model control flow graph nodes to be ignored during quantization.
+        subset_size (`int`, *optional*, defaults to 128):
+            Number of data samples to calculate activation statistics.

     """

@@ -208,6 +211,7 @@ def __init__(
         all_layers: Optional[bool] = None,
         sensitivity_metric: Optional[str] = None,
         ignored_scope: Optional[dict] = None,
+        subset_size: int = 128,
         **kwargs,
     ):
         self.bits = bits
@@ -219,6 +223,7 @@ def __init__(
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
         self.ignored_scope = ignored_scope
+        self.subset_size = subset_size
         self.quant_method = "default"  # TODO : enable AWQ after nncf v2.9.0 release
         self.post_init()

@@ -231,10 +236,16 @@ def post_init(self):
         if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
             raise ValueError("`group_size` must be greater than 0 or equal to -1")
         if self.dataset is not None and isinstance(self.dataset, str):
-            if self.dataset not in ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]:
+            llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]
+            stable_diffusion_datasets = [
+                "conceptual_captions",
+                "laion/220k-GPT4Vision-captions-from-LIVIS",
+                "laion/filtered-wit",
+            ]
+            if self.dataset not in llm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    ['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}"""
+                    {llm_datasets} for LLMs or {stable_diffusion_datasets} for SD models, but we found {self.dataset}"""
                 )

         if self.bits not in [4, 8]:
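
Note: taken together, the changes above let a plain string select hybrid quantization for diffusion pipelines. A minimal sketch of the resulting user-facing call (the model ID below is illustrative, not part of this commit):

    from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

    # A dataset name from the SD list plus subset_size is all that is needed.
    config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=200)
    pipeline = OVStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # placeholder model ID
        export=True,
        quantization_config=config,
    )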

optimum/intel/openvino/modeling_diffusion.py

+62-3
@@ -14,6 +14,7 @@

 import importlib
 import logging
+import math
 import os
 import shutil
 from pathlib import Path
@@ -274,9 +275,17 @@ def _from_pretrained(
                 kwargs[name] = load_method(new_model_save_dir)

         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
-        unet = cls.load_model(
-            new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, quantization_config
-        )
+
+        dataset = None
+        if quantization_config:
+            dataset = quantization_config.dataset
+            quantization_config.dataset = None  # apply weight compression without dataset
+
+        unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name
+        if quantization_config and dataset is None:
+            unet = cls.load_model(unet_path, quantization_config)
+        else:
+            unet = cls.load_model(unet_path)

         components = {
             "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
@@ -291,6 +300,32 @@ def _from_pretrained(
         if model_save_dir is None:
             model_save_dir = new_model_save_dir

+        if quantization_config and dataset is not None:
+            sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
+
+            supported_pipelines = (
+                OVStableDiffusionPipeline,
+                OVStableDiffusionXLPipeline,
+                OVLatentConsistencyModelPipeline,
+            )
+            if not isinstance(sd_model, supported_pipelines):
+                raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")
+
+            # LCM pipelines generate with ~4 denoising steps, the others with the default 50
+            num_inference_steps = 4 if issubclass(cls, OVLatentConsistencyModelPipeline) else 50
+            quantization_config.dataset = dataset
+
+            if isinstance(quantization_config.dataset, str):
+                from .quantization import get_stable_diffusion_dataset
+
+                dataset_name = quantization_config.dataset
+                num_samples = math.ceil(quantization_config.subset_size / num_inference_steps)
+                quantization_config.dataset = get_stable_diffusion_dataset(dataset_name, num_samples)
+
+            unet_inputs = sd_model.prepare_inputs(quantization_config.dataset, quantization_config.subset_size, num_inference_steps)
+            quantization_config.dataset = unet_inputs
+
+            from .quantization import _hybrid_quantization
+
+            unet = _hybrid_quantization(sd_model.unet.model, quantization_config)
+
         return cls(
             unet=unet,
             config=config,
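
Note: the sample arithmetic above keeps calibration cost proportional to subset_size. Each pipeline call runs the UNet once per denoising step, so math.ceil(subset_size / num_inference_steps) prompts suffice; with the default subset_size=128 and a 50-step schedule that is math.ceil(128 / 50) = 3 prompts, and prepare_inputs (below) stops collecting once 128 UNet input samples have been captured.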
@@ -300,6 +335,30 @@ def _from_pretrained(
             **kwargs,
         )

+    def prepare_inputs(
+        self,
+        dataset: "Dataset",
+        subset_size: int,
+        num_inference_steps: int,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        **kwargs,
+    ) -> "Dataset":
+        self.compile()
+        calibration_data = []
+
+        from .quantization import InferRequestWrapper
+
+        # Wrap the UNet inference request so that every call records its inputs
+        self.unet.request = InferRequestWrapper(self.unet.request, calibration_data)
+        for prompt in dataset.get_inference_data():
+            _ = self.__call__(prompt, num_inference_steps=num_inference_steps, height=height, width=width)
+            if len(calibration_data) >= subset_size:
+                break
+        # Restore the original request once enough samples are collected
+        self.unet.request = self.unet.request.request
+
+        from nncf import Dataset
+
+        return Dataset(calibration_data)
+
+
     @classmethod
     def _from_transformers(
         cls,
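
Note: prepare_inputs relies on InferRequestWrapper from quantization.py, which this diff does not show. Conceptually it is a thin proxy along these lines (a simplified sketch, not the actual implementation):

    class InferRequestWrapper:
        def __init__(self, request, data_cache):
            self.request = request        # the original openvino InferRequest
            self.data_cache = data_cache  # shared list that accumulates UNet inputs

        def __call__(self, inputs, *args, **kwargs):
            self.data_cache.append(inputs)       # record a calibration sample
            return self.request(inputs, *args, **kwargs)

The unwrap line `self.unet.request = self.unet.request.request` then restores the original request stored on the wrapper.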

optimum/intel/openvino/quantization.py

+104
@@ -16,6 +16,9 @@
 import inspect
 import logging
 import os
+from collections import deque
+from copy import deepcopy
+from datasets import load_dataset
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

@@ -24,6 +27,7 @@
 import torch
 import transformers
 from nncf import CompressWeightsMode, IgnoredScope, NNCFConfig, SensitivityMetric
+from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters
 from nncf.torch import create_compressed_model, register_default_init_args, register_module
 from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk
 from nncf.torch.initialization import PTInitializingDataLoader
@@ -584,4 +588,104 @@ def _weight_only_quantization(
         # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0
         ignored_scope=ignored_scope,
         dataset=dataset,
+        subset_size=config.subset_size,
     )
+
+
+def _get_operation_const_op(operation, const_port_id: int):
+    node = operation.input_value(const_port_id).get_node()
+    queue = deque([node])
+    constant_node = None
+    allowed_propagation_types_list = ["Convert", "FakeQuantize", "Reshape"]
+
+    # Walk producers breadth-first, looking through Convert/FakeQuantize/Reshape
+    # nodes until a Constant (the weight) is found
+    while len(queue) != 0:
+        curr_node = queue.popleft()
+        if curr_node.get_type_name() == "Constant":
+            constant_node = curr_node
+            break
+        if len(curr_node.inputs()) == 0:
+            break
+        if curr_node.get_type_name() in allowed_propagation_types_list:
+            queue.append(curr_node.input_value(0).get_node())
+
+    return constant_node
+
+
+def _is_embedding(node) -> bool:
+    allowed_types_list = ["f16", "f32", "f64"]
+    const_port_id = 0
+    input_tensor = node.input_value(const_port_id)
+    if input_tensor.get_element_type().get_type_name() in allowed_types_list:
+        const_node = _get_operation_const_op(node, const_port_id)
+        if const_node is not None:
+            return True
+
+    return False
+
+
+def _collect_ops_with_weights(model):
+    ops_with_weights = []
+    for op in model.get_ops():
+        if op.get_type_name() == "MatMul":
+            constant_node_0 = _get_operation_const_op(op, const_port_id=0)
+            constant_node_1 = _get_operation_const_op(op, const_port_id=1)
+            if constant_node_0 or constant_node_1:
+                ops_with_weights.append(op.get_friendly_name())
+        if op.get_type_name() == "Gather" and _is_embedding(op):
+            ops_with_weights.append(op.get_friendly_name())
+
+    return ops_with_weights
+
+
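Note: the op names returned by _collect_ops_with_weights feed the ignored scope of the activation-quantization pass in _hybrid_quantization below. This is the crux of the hybrid scheme: MatMul and embedding Gather ops keep weight-only compression, while the remaining ops (notably Convolutions) get full activation quantization.
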
+def get_stable_diffusion_dataset(
+    dataset_name: str, nsamples: int = 50, seed: int = 0, text_column: str = "caption"
+) -> nncf.Dataset:
+    if dataset_name not in [
+        "conceptual_captions",
+        "laion/220k-GPT4Vision-captions-from-LIVIS",
+        "laion/filtered-wit",
+    ]:
+        raise ValueError(
+            f"""You have entered a string value for dataset. You can only choose between
+            ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'],
+            but we found {dataset_name}"""
+        )
+
+    data = load_dataset(dataset_name, split="train", streaming=True).shuffle(seed=seed).take(nsamples)
+    dataset = [batch[text_column] for batch in data]
+    return nncf.Dataset(dataset)
+
+
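Note: a quick usage sketch for the helper above; it streams a handful of captions without downloading the full dataset:

    from optimum.intel.openvino.quantization import get_stable_diffusion_dataset

    prompts = get_stable_diffusion_dataset("conceptual_captions", nsamples=5)
    for prompt in prompts.get_inference_data():
        print(prompt)
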
+def _hybrid_quantization(
+    model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]
+):
+    dataset = quantization_config.dataset
+    wc_ignored_scope = deepcopy(quantization_config.ignored_scope)
+
+    # Weight compression skips Convolutions (they are fully quantized below)
+    if isinstance(wc_ignored_scope, dict):
+        wc_ignored_scope["types"] = wc_ignored_scope.get("types", []) + ["Convolution"]
+    else:
+        assert wc_ignored_scope is None
+        wc_ignored_scope = {"types": ["Convolution"]}
+
+    # Activation quantization skips the ops whose weights were compressed
+    ops_to_compress = _collect_ops_with_weights(model)
+    ptq_ignored_scope = deepcopy(quantization_config.ignored_scope)
+    if isinstance(ptq_ignored_scope, dict):
+        ptq_ignored_scope["names"] = ptq_ignored_scope.get("names", []) + ops_to_compress
+    else:
+        assert ptq_ignored_scope is None
+        ptq_ignored_scope = {"names": ops_to_compress}
+
+    quantization_config.dataset = None  # Apply Weight Compression without dataset
+    quantization_config.ignored_scope = wc_ignored_scope
+    compressed_model = _weight_only_quantization(model, quantization_config)
+
+    quantized_model = nncf.quantize(
+        compressed_model,
+        dataset,
+        model_type=nncf.ModelType.TRANSFORMER,
+        ignored_scope=nncf.IgnoredScope(**ptq_ignored_scope),
+        # matmul=-1 disables SmoothQuant for MatMuls, whose weights stay compressed
+        advanced_parameters=nncf.AdvancedQuantizationParameters(
+            smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)
+        ),
+        subset_size=quantization_config.subset_size,
+    )
+    return quantized_model
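
Note: by the time _hybrid_quantization runs, quantization_config.dataset holds ready-made UNet input dicts collected by prepare_inputs, not text prompts, because nncf.quantize feeds each dataset item directly to the model. A hedged sketch of what one such item could look like for a 512x512 SD 1.5 UNet (names, shapes, and values are illustrative; real items are captured from live UNet calls):

    import numpy as np
    import nncf

    example_unet_input = {
        "sample": np.zeros((2, 4, 64, 64), dtype=np.float32),  # latents; batch doubled by classifier-free guidance
        "timestep": np.array([981], dtype=np.int64),           # scheduler timestep
        "encoder_hidden_states": np.zeros((2, 77, 768), dtype=np.float32),  # text-encoder output
    }
    calibration_dataset = nncf.Dataset([example_unet_input])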

tests/openvino/test_quantization.py

+41
@@ -39,6 +39,7 @@

 from optimum.intel import (
     OVConfig,
+    OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
     OVModelForFeatureExtraction,
@@ -233,6 +234,12 @@ class OVWeightCompressionTest(unittest.TestCase):
         (OVStableDiffusionXLPipeline, "stable-diffusion-xl"),
     )

+    SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = (
+        (OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
+        (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
+        (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135),
+    )
+
     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")

     DEFAULT_INT4_CONFIG = {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}
@@ -352,6 +359,40 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
             _, num_int8, _ = get_num_quantized_nodes(model)
             self.assertEqual(expected_ov_int8[i], num_int8)

+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
+    def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):
+        model_id = MODEL_NAMES[model_type]
+        quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=5)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+
+            num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
+            self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
+            self.assertEqual(expected_ov_int8, num_int8)
+            self.assertEqual(0, num_int4)
+
+            model.save_pretrained(tmp_dir)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
+    def test_ovmodel_hybrid_quantization_with_custom_dataset(
+        self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8
+    ):
+        model_id = MODEL_NAMES[model_type]
+        dataset_name = "daspartho/stable-diffusion-prompts"
+        dataset = load_dataset(dataset_name, split="train", streaming=True)
+        quantization_dataset = nncf.Dataset(dataset, lambda x: x["prompt"])
+        model = model_cls.from_pretrained(
+            model_id,
+            export=True,
+            quantization_config=OVWeightQuantizationConfig(
+                bits=8, dataset=quantization_dataset, subset_size=3
+            ),
+        )
+        num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
+        self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
+        self.assertEqual(expected_ov_int8, num_int8)
+        self.assertEqual(0, num_int4)
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS)
     @unittest.mock.patch.dict(
         "optimum.intel.openvino.configuration._DEFAULT_4BIT_CONFIGS", {"facebook/opt-125m": DEFAULT_INT4_CONFIG}

tests/openvino/utils_tests.py

+2-2
@@ -127,8 +127,8 @@ def get_num_quantized_nodes(ov_model):
         if "FakeQuantize" in elem.name:
             num_fake_quantize += 1
         for i in range(elem.get_output_size()):
-            if "8" in elem.get_output_element_type(i).get_type_name():
+            if elem.get_output_element_type(i).get_type_name() in ["i8", "u8"]:
                 num_int8 += 1
-            if "4" in elem.get_output_element_type(i).get_type_name():
+            if elem.get_output_element_type(i).get_type_name() in ["i4", "u4"]:
                 num_int4 += 1
     return num_fake_quantize, num_int8, num_int4
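
Note: the exact-match lists avoid miscounting. OpenVINO element type names are strings such as "f32", "u8", or "i4"; low-precision types like "nf4" also contain the digit "4", so the old substring checks could count outputs that are neither int4/uint4 nor int8/uint8.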
