Commit d69470a

Add hybrid quantization for StableDiffusion pipelines

1 parent 2d14e25 commit d69470a

File tree: 5 files changed (+225 −8 lines)

optimum/intel/openvino/configuration.py (+14 −3)

@@ -178,7 +178,8 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
             using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
         dataset (`Union[List[str]]`, *optional*):
             The dataset used for data-aware compression. You can provide your own dataset in a list of string or just use the
-            the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new']
+            one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLMs or
+            ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models
         group_size (`int`, *optional*, defaults to 128):
             The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         ratio (`float`, *optional*, defaults to 1.0):
@@ -193,6 +194,8 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
             Enables AWQ method to unify weight ranges and improve overall model accuracy.
         ignored_scope (`nncf.IgnoredScope`, *optional*):
             An ignored scope that defines the list of model control flow graph nodes to be ignored during quantization.
+        subset_size (`int`, *optional*, defaults to 128):
+            Number of data samples used to calculate activation statistics.

     """

@@ -207,6 +210,7 @@ def __init__(
         all_layers: Optional[bool] = None,
         sensitivity_metric: Optional[str] = None,
         ignored_scope: Optional[dict] = None,
+        subset_size: int = 128,
         **kwargs,
     ):
         self.bits = bits
@@ -218,6 +222,7 @@ def __init__(
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
         self.ignored_scope = ignored_scope
+        self.subset_size = subset_size
         self.quant_method = "default"  # TODO : enable AWQ after nncf v2.9.0 release
         self.post_init()

@@ -230,10 +235,16 @@ def post_init(self):
         if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
             raise ValueError("`group_size` must be greater than 0 or equal to -1")
         if self.dataset is not None and isinstance(self.dataset, str):
-            if self.dataset not in ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]:
+            llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]
+            stable_diffusion_datasets = [
+                "conceptual_captions",
+                "laion/220k-GPT4Vision-captions-from-LIVIS",
+                "laion/filtered-wit",
+            ]
+            if self.dataset not in llm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    ['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}"""
+                    {llm_datasets} for LLMs or {stable_diffusion_datasets} for SD models, but we found {self.dataset}"""
                 )

         if self.bits not in [4, 8]:
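
Taken together, the configuration changes mean a string `dataset` can now name either an LLM corpus or a Stable Diffusion caption set, and `subset_size` bounds the number of samples used for activation statistics. A minimal usage sketch, assuming `OVWeightQuantizationConfig` is exposed at the `optimum.intel` package root as the tests below use it:

    from optimum.intel import OVWeightQuantizationConfig

    # An SD caption dataset passes the extended post_init() validation.
    config = OVWeightQuantizationConfig(
        bits=8,
        dataset="conceptual_captions",
        subset_size=200,  # samples used for activation statistics
    )

    # Any other string still fails fast with the ValueError shown above.
    try:
        OVWeightQuantizationConfig(bits=8, dataset="imagenet")
    except ValueError as err:
        print(err)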

optimum/intel/openvino/modeling_diffusion.py (+64 −3)

@@ -14,6 +14,7 @@

 import importlib
 import logging
+import math
 import os
 import shutil
 from pathlib import Path
@@ -261,9 +262,19 @@ def _from_pretrained(
         if load_in_8bit:
            quantization_config = quantization_config or {"bits": 8}

-        unet = cls.load_model(
-            new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, quantization_config
-        )
+        if isinstance(quantization_config, dict):
+            quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
+
+        dataset = None
+        if quantization_config:
+            dataset = quantization_config.dataset
+            quantization_config.dataset = None  # apply weight compression without dataset
+
+        unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name
+        if quantization_config and dataset is None:
+            unet = cls.load_model(unet_path, quantization_config)
+        else:
+            unet = cls.load_model(unet_path)

         components = {
             "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
@@ -278,8 +289,58 @@ def _from_pretrained(
         if model_save_dir is None:
             model_save_dir = new_model_save_dir

+        if quantization_config and dataset is not None:
+            sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
+
+            supported_pipelines = (
+                OVStableDiffusionPipeline,
+                OVStableDiffusionXLPipeline,
+                OVLatentConsistencyModelPipeline,
+            )
+            if not isinstance(sd_model, supported_pipelines):
+                raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")
+
+            num_inference_steps = 4 if isinstance(sd_model, OVLatentConsistencyModelPipeline) else 50
+            quantization_config.dataset = dataset
+
+            if isinstance(quantization_config.dataset, str):
+                from .quantization import get_stable_diffusion_dataset
+
+                dataset_name = quantization_config.dataset
+                num_samples = math.ceil(quantization_config.subset_size / num_inference_steps)
+                quantization_config.dataset = get_stable_diffusion_dataset(dataset_name, num_samples)
+
+            unet_inputs = sd_model.prepare_inputs(
+                quantization_config.dataset, quantization_config.subset_size, num_inference_steps
+            )
+            quantization_config.dataset = unet_inputs
+
+            from .quantization import _hybrid_quantization
+
+            unet = _hybrid_quantization(sd_model.unet.model, quantization_config)
+
         return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)

+    def prepare_inputs(
+        self,
+        dataset: "Dataset",
+        subset_size: int,
+        num_inference_steps: int,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        **kwargs,
+    ) -> "Dataset":
+        self.compile()
+        calibration_data = []
+
+        from .quantization import InferRequestWrapper
+
+        self.unet.request = InferRequestWrapper(self.unet.request, calibration_data)
+        for prompt in dataset.get_inference_data():
+            _ = self.__call__(prompt, num_inference_steps=num_inference_steps, height=height, width=width)
+            if len(calibration_data) >= subset_size:
+                break
+        self.unet.request = self.unet.request.request
+
+        from nncf import Dataset
+
+        return Dataset(calibration_data)
+
     @classmethod
     def _from_transformers(
         cls,
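
End to end, the new `_from_pretrained` branch works as follows: the UNet is first loaded uncompressed, `prepare_inputs` then drives the assembled pipeline over calibration prompts to capture real UNet inputs via `InferRequestWrapper`, and `_hybrid_quantization` compresses the UNet's weights and quantizes its activations against those inputs. Note the sample arithmetic: with `subset_size=300` and 50 inference steps, `math.ceil(300 / 50) = 6` prompts are drawn, since each generation contributes one UNet input per denoising step. A hedged usage sketch, with an illustrative checkpoint name:

    from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

    quantization_config = OVWeightQuantizationConfig(
        bits=8,
        dataset="conceptual_captions",  # setting a dataset triggers the hybrid path
        subset_size=300,
    )

    # export=True converts the diffusers checkpoint to OpenVINO IR; because the
    # config carries a dataset, the UNet ends up hybrid-quantized as in the
    # branch above.
    pipe = OVStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # illustrative checkpoint
        export=True,
        quantization_config=quantization_config,
    )
    image = pipe("sailing ship in a storm by Rembrandt").images[0]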

optimum/intel/openvino/quantization.py (+104)

@@ -15,6 +15,9 @@
 import inspect
 import logging
 import os
+from collections import deque
+from copy import deepcopy
+from datasets import load_dataset
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

@@ -23,6 +26,7 @@
 import torch
 import transformers
 from nncf import CompressWeightsMode, IgnoredScope, NNCFConfig, SensitivityMetric
+from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters
 from nncf.torch import create_compressed_model, register_default_init_args, register_module
 from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk
 from nncf.torch.initialization import PTInitializingDataLoader
@@ -577,4 +581,104 @@ def _weight_only_quantization(
         # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0
         ignored_scope=ignored_scope,
         dataset=dataset,
+        subset_size=config.subset_size,
     )
+
+
+def _get_operation_const_op(operation, const_port_id: int):
+    node = operation.input_value(const_port_id).get_node()
+    queue = deque([node])
+    constant_node = None
+    allowed_propagation_types_list = ["Convert", "FakeQuantize", "Reshape"]
+
+    while len(queue) != 0:
+        curr_node = queue.popleft()
+        if curr_node.get_type_name() == "Constant":
+            constant_node = curr_node
+            break
+        if len(curr_node.inputs()) == 0:
+            break
+        if curr_node.get_type_name() in allowed_propagation_types_list:
+            queue.append(curr_node.input_value(0).get_node())
+
+    return constant_node
+
+
+def _is_embedding(node) -> bool:
+    allowed_types_list = ["f16", "f32", "f64"]
+    const_port_id = 0
+    input_tensor = node.input_value(const_port_id)
+    if input_tensor.get_element_type().get_type_name() in allowed_types_list:
+        const_node = _get_operation_const_op(node, const_port_id)
+        if const_node is not None:
+            return True
+
+    return False
+
+
+def _collect_ops_with_weights(model):
+    ops_with_weights = []
+    for op in model.get_ops():
+        if op.get_type_name() == "MatMul":
+            constant_node_0 = _get_operation_const_op(op, const_port_id=0)
+            constant_node_1 = _get_operation_const_op(op, const_port_id=1)
+            if constant_node_0 or constant_node_1:
+                ops_with_weights.append(op.get_friendly_name())
+        if op.get_type_name() == "Gather" and _is_embedding(op):
+            ops_with_weights.append(op.get_friendly_name())
+
+    return ops_with_weights
+
+
+def get_stable_diffusion_dataset(
+    dataset_name: str, nsamples: int = 50, seed: int = 0, text_column: str = "caption"
+) -> nncf.Dataset:
+    if dataset_name not in [
+        "conceptual_captions",
+        "laion/220k-GPT4Vision-captions-from-LIVIS",
+        "laion/filtered-wit",
+    ]:
+        raise ValueError(
+            f"""You have entered a string value for dataset. You can only choose between
+            ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'],
+            but we found {dataset_name}"""
+        )
+
+    data = load_dataset(dataset_name, split="train", streaming=True).shuffle(seed=seed).take(nsamples)
+    dataset = [batch[text_column] for batch in data]
+    return nncf.Dataset(dataset)
+
+
+def _hybrid_quantization(
+    model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]
+):
+    dataset = quantization_config.dataset
+    wc_ignored_scope = deepcopy(quantization_config.ignored_scope)
+
+    if isinstance(wc_ignored_scope, dict):
+        wc_ignored_scope["types"] = wc_ignored_scope.get("types", []) + ["Convolution"]
+    else:
+        assert wc_ignored_scope is None
+        wc_ignored_scope = {"types": ["Convolution"]}
+
+    ops_to_compress = _collect_ops_with_weights(model)
+    ptq_ignored_scope = deepcopy(quantization_config.ignored_scope)
+    if isinstance(ptq_ignored_scope, dict):
+        ptq_ignored_scope["names"] = ptq_ignored_scope.get("names", []) + ops_to_compress
+    else:
+        assert ptq_ignored_scope is None
+        ptq_ignored_scope = {"names": ops_to_compress}
+
+    quantization_config.dataset = None  # Apply Weight Compression without dataset
+    quantization_config.ignored_scope = wc_ignored_scope
+    compressed_model = _weight_only_quantization(model, quantization_config)
+
+    quantized_model = nncf.quantize(
+        compressed_model,
+        dataset,
+        model_type=nncf.ModelType.TRANSFORMER,
+        ignored_scope=nncf.IgnoredScope(**ptq_ignored_scope),
+        advanced_parameters=nncf.AdvancedQuantizationParameters(
+            smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)
+        ),
+        subset_size=quantization_config.subset_size,
+    )
+    return quantized_model
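
Stripped of the config plumbing, the hybrid scheme is two NNCF passes over one OpenVINO model: weight-only compression that skips `Convolution` nodes, followed by activation quantization that ignores the ops whose weights were just compressed. A condensed sketch, assuming `ov_model` is an `openvino.runtime.Model` and `calibration_dataset` an `nncf.Dataset` of captured UNet inputs; it reuses `_collect_ops_with_weights` from this diff and omits the merging of user-provided ignored scopes that `_hybrid_quantization` also performs:

    import nncf
    from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters

    def hybrid_quantize_sketch(ov_model, calibration_dataset, subset_size=128):
        # Pass 1: compress weights everywhere except convolutions,
        # mirroring wc_ignored_scope = {"types": ["Convolution"]} above.
        compressed = nncf.compress_weights(
            ov_model, ignored_scope=nncf.IgnoredScope(types=["Convolution"])
        )
        # Pass 2: quantize activations, skipping the weighted MatMul/Gather ops.
        return nncf.quantize(
            compressed,
            calibration_dataset,
            model_type=nncf.ModelType.TRANSFORMER,
            ignored_scope=nncf.IgnoredScope(names=_collect_ops_with_weights(compressed)),
            # SmoothQuant is disabled for MatMuls since their weights are already compressed.
            advanced_parameters=nncf.AdvancedQuantizationParameters(
                smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1)
            ),
            subset_size=subset_size,
        )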

tests/openvino/test_quantization.py (+41)

@@ -36,6 +36,7 @@

 from optimum.intel import (
     OVConfig,
+    OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
     OVModelForFeatureExtraction,
@@ -230,6 +231,12 @@ class OVWeightCompressionTest(unittest.TestCase):
         (OVStableDiffusionXLPipeline, "stable-diffusion-xl"),
     )

+    SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = (
+        (OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
+        (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
+        (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135),
+    )
+
     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
@@ -345,6 +352,40 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
             _, num_int8, _ = get_num_quantized_nodes(model)
             self.assertEqual(expected_ov_int8[i], num_int8)

+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
+    def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):
+        model_id = MODEL_NAMES[model_type]
+        quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=5)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+
+            num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
+            self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
+            self.assertEqual(expected_ov_int8, num_int8)
+            self.assertEqual(0, num_int4)
+
+            model.save_pretrained(tmp_dir)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
+    def test_ovmodel_hybrid_quantization_with_custom_dataset(
+        self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8
+    ):
+        model_id = MODEL_NAMES[model_type]
+        dataset_name = "daspartho/stable-diffusion-prompts"
+        dataset = load_dataset(dataset_name, split="train", streaming=True)
+        quantization_dataset = nncf.Dataset(dataset, lambda x: x["prompt"])
+        model = model_cls.from_pretrained(
+            model_id,
+            export=True,
+            quantization_config=OVWeightQuantizationConfig(bits=8, dataset=quantization_dataset, subset_size=3),
+        )
+        num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
+        self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
+        self.assertEqual(expected_ov_int8, num_int8)
+        self.assertEqual(0, num_int4)
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS)
     def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_int8, expected_ov_int4):
         with tempfile.TemporaryDirectory() as tmp_dir:
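
The second test exercises the escape hatch for custom calibration data: `dataset` may be a ready-made `nncf.Dataset` instead of a string, in which case the string validation in `post_init()` and `get_stable_diffusion_dataset` are bypassed entirely. Outside the test suite the same pattern looks roughly like this (the checkpoint name is illustrative; the prompt column matches this particular dataset):

    import nncf
    from datasets import load_dataset
    from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

    # Stream a prompts dataset and wrap it for NNCF; the transform extracts
    # the text column from each record.
    prompts = load_dataset("daspartho/stable-diffusion-prompts", split="train", streaming=True)
    calibration = nncf.Dataset(prompts, lambda sample: sample["prompt"])

    pipe = OVStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # illustrative checkpoint
        export=True,
        quantization_config=OVWeightQuantizationConfig(bits=8, dataset=calibration, subset_size=3),
    )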

tests/openvino/utils_tests.py (+2 −2)

@@ -127,8 +127,8 @@ def get_num_quantized_nodes(ov_model):
         if "FakeQuantize" in elem.name:
             num_fake_quantize += 1
         for i in range(elem.get_output_size()):
-            if "8" in elem.get_output_element_type(i).get_type_name():
+            if elem.get_output_element_type(i).get_type_name() in ["i8", "u8"]:
                 num_int8 += 1
-            if "4" in elem.get_output_element_type(i).get_type_name():
+            if elem.get_output_element_type(i).get_type_name() in ["i4", "u4"]:
                 num_int4 += 1
     return num_fake_quantize, num_int8, num_int4
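
The switch to exact type-name matching matters because the old substring checks overcount: `"4" in "f64"` is true, so an FP64 output would have been tallied as INT4. A small illustration, with type names as returned by OpenVINO's `Type.get_type_name()`:

    type_names = ["i8", "u8", "i4", "u4", "f64", "f32"]

    loose_int4 = [t for t in type_names if "4" in t]            # ['i4', 'u4', 'f64'] -- wrong
    strict_int4 = [t for t in type_names if t in ["i4", "u4"]]  # ['i4', 'u4']

    assert strict_int4 == ["i4", "u4"]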
