
Commit 2dc4087

Apply comments

1 parent 067c6d5 commit 2dc4087

File tree

4 files changed: +24 -19 lines changed

docs/source/optimization_ov.mdx (+3 -3)

@@ -71,10 +71,10 @@ model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
 
 ## Hybrid quantization
 
-Traditional optimization methods like post-training 8-bit quantization do not work for Stable Diffusion models because accuracy drops significantly. On the other hand, weight compression does not improve performance when applied to Stable Diffusion models, as the size of activations is comparable to weights.
+Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights.
 The UNet model takes up most of the overall execution time of the pipeline. Thus, optimizing just one model brings substantial benefits in terms of inference speed while keeping acceptable accuracy without fine-tuning. Quantizing the rest of the diffusion pipeline does not significantly improve inference performance but could potentially lead to substantial degradation of accuracy.
-Therefore, the proposal is to apply quantization in hybrid mode for the UNet model and weight-only quantization for other pipeline components. The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and activations of other layers, facilitating accuracy preservation post-optimization while reducing the model size.
-For optimizing the Stable Diffusion pipeline, utilize the `quantization_config` to define optimization parameters. To enable hybrid quantization, specify the quantization dataset in the `quantization_config`; otherwise, weight-only quantization in specified precisions will be applied to UNet.
+Therefore, the proposal is to apply quantization in *hybrid mode* for the UNet model and weight-only quantization for the rest of the pipeline components. The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and activations of other layers, facilitating accuracy preservation post-optimization while reducing the model size.
+The `quantization_config` is utilized to define optimization parameters for optimizing the Stable Diffusion pipeline. To enable hybrid quantization, specify the quantization dataset in the `quantization_config`. Otherwise, weight-only quantization to a specified data type (8 or 4 bits) is applied to the UNet model.
 
 ```python
 from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig
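To show the enabling mechanism end to end, here is a minimal sketch of how the documentation snippet that begins above might continue. Only the import line is taken from this diff; the model id and the `bits`/`num_samples` values are illustrative assumptions, not part of the commit.

```python
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

model_id = "runwayml/stable-diffusion-v1-5"  # illustrative model id

# Specifying `dataset` triggers hybrid quantization of the UNet;
# the remaining pipeline components get weight-only compression.
quantization_config = OVWeightQuantizationConfig(
    bits=8,
    dataset="conceptual_captions",  # one of the predefined diffusion datasets
    num_samples=200,                # matches the fallback used in _from_pretrained
)
pipeline = OVStableDiffusionPipeline.from_pretrained(
    model_id,
    export=True,
    quantization_config=quantization_config,
)
```

Omitting `dataset` from the config would instead apply plain weight-only quantization to the UNet, as the paragraph above describes.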

optimum/intel/openvino/configuration.py (+3 -3)

@@ -179,8 +179,8 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
             using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
         dataset (`str or List[str]`, *optional*):
             The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset
-            in a list of string or just use the the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs
-            or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models.
+            in a list of strings or just use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLMs
+            or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models.
         ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
@@ -243,7 +243,7 @@ def post_init(self):
             if self.dataset not in llm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    {llm_datasets} for LLLMs or {stable_diffusion_datasets} for SD models, but we found {self.dataset}"""
+                    {llm_datasets} for LLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )
 
         if self.bits not in [4, 8]:
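As a quick illustration of the validation in `post_init` above, and assuming the config runs its checks at construction time, a string dataset outside the predefined lists should hit the `ValueError` shown in the hunk:

```python
from optimum.intel import OVWeightQuantizationConfig

# Predefined names from the docstring pass the check.
OVWeightQuantizationConfig(bits=4, dataset="wikitext2")            # an LLM dataset
OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions")  # a diffusion dataset

# Any other string triggers the ValueError raised in post_init.
try:
    OVWeightQuantizationConfig(bits=8, dataset="imagenet")
except ValueError as err:
    print(err)  # "You have entered a string value for dataset. ..."
```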

optimum/intel/openvino/modeling_diffusion.py (+15 -11)

@@ -282,16 +282,17 @@ def _from_pretrained(
 
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
 
-        dataset = None
         unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name
         if quantization_config is not None and quantization_config.dataset is not None:
-            dataset = quantization_config.dataset
             # load the UNet model uncompressed to apply hybrid quantization further
             unet = cls.load_model(unet_path)
             # Apply weights compression to other `components` without dataset
-            quantization_config.dataset = None
+            q_config_params = quantization_config.__dict__
+            wc_params = {param: value for param, value in q_config_params.items() if param != "dataset"}
+            wc_quantization_config = OVWeightQuantizationConfig.from_dict(wc_params)
         else:
-            unet = cls.load_model(unet_path, quantization_config)
+            wc_quantization_config = quantization_config
+            unet = cls.load_model(unet_path, wc_quantization_config)
 
         components = {
             "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
@@ -301,12 +302,12 @@ def _from_pretrained(
         }
 
         for key, value in components.items():
-            components[key] = cls.load_model(value, quantization_config) if value.is_file() else None
+            components[key] = cls.load_model(value, wc_quantization_config) if value.is_file() else None
 
         if model_save_dir is None:
             model_save_dir = new_model_save_dir
 
-        if dataset is not None:
+        if quantization_config is not None and quantization_config.dataset is not None:
             sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
 
             supported_pipelines = (
@@ -318,12 +319,11 @@ def _from_pretrained(
                 raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")
 
             nsamples = quantization_config.num_samples if quantization_config.num_samples else 200
-            unet_inputs = sd_model._prepare_unet_inputs(dataset, nsamples)
+            unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples)
 
             from .quantization import _hybrid_quantization
 
-            unet = _hybrid_quantization(sd_model.unet.model, quantization_config, dataset=unet_inputs)
-            quantization_config.dataset = dataset
+            unet = _hybrid_quantization(sd_model.unet.model, wc_quantization_config, dataset=unet_inputs)
 
             return cls(
                 unet=unet,
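The hunks above replace the old mutate-and-restore dance on `quantization_config.dataset` with a fresh weight-compression config built from every field except `dataset`, so the shared config object is never modified. A standalone sketch of that copy-without-one-key pattern, using a toy dataclass in place of the real `OVWeightQuantizationConfig`:

```python
from dataclasses import asdict, dataclass
from typing import Optional


@dataclass
class ToyQuantizationConfig:  # stand-in for OVWeightQuantizationConfig
    bits: int = 8
    num_samples: Optional[int] = None
    dataset: Optional[str] = None


config = ToyQuantizationConfig(bits=8, num_samples=200, dataset="conceptual_captions")

# Same pattern as the diff: copy every field except `dataset`,
# leaving the original config intact for the hybrid-quantization step.
wc_params = {param: value for param, value in asdict(config).items() if param != "dataset"}
wc_config = ToyQuantizationConfig(**wc_params)

assert wc_config.dataset is None
assert config.dataset == "conceptual_captions"  # original is untouched
```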
@@ -338,13 +338,17 @@ def _prepare_unet_inputs(
         self,
         dataset: Union[str, List[Any]],
         num_samples: int,
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         seed: Optional[int] = 42,
         **kwargs,
     ) -> Dict[str, Any]:
         self.compile()
 
+        size = self.unet.config.get("sample_size", 64) * self.vae_scale_factor
+        height = height or min(size, 512)
+        width = width or min(size, 512)
+
         if isinstance(dataset, str):
             dataset = deepcopy(dataset)
             available_datasets = PREDEFINED_SD_DATASETS.keys()
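The last hunk derives the default calibration resolution from the UNet's `sample_size` and the VAE scale factor instead of hard-coding 512, capping the result at 512. A small sketch of the arithmetic, with typical values assumed for illustration:

```python
def default_resolution(sample_size: int, vae_scale_factor: int) -> int:
    """Mirror of the new defaulting logic: latent size times VAE scale, capped at 512."""
    size = sample_size * vae_scale_factor
    return min(size, 512)


print(default_resolution(64, 8))   # 512, a typical SD 1.x UNet
print(default_resolution(128, 8))  # 1024 capped back to 512 (an SDXL-sized UNet)
print(default_resolution(32, 8))   # 256 for a smaller tiny/test model
```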

tests/openvino/test_quantization.py (+3 -2)

@@ -159,7 +159,7 @@ class OVWeightCompressionTest(unittest.TestCase):
     )
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 86),)
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 150),)
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 148),)
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = (
         (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 14, 50),
     )
@@ -236,6 +236,7 @@ class OVWeightCompressionTest(unittest.TestCase):
 
     SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = (
         (OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
+        (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
         (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135),
     )
 
@@ -372,7 +373,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f
 
         model.save_pretrained(tmp_dir)
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:])
     def test_ovmodel_hybrid_quantization_with_custom_dataset(
         self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8
     ):
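For clarity on the decorator change: the `[-1:]` slice keeps only the last parameter tuple, so the custom-dataset test now runs a single case rather than every pipeline. A tiny illustration of the slicing:

```python
SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = (
    ("stable-diffusion", 72, 195),
    ("stable-diffusion-xl", 84, 331),
    ("latent-consistency", 50, 135),
)

# [-1:] returns a one-element tuple rather than a bare element,
# which is the iterable-of-parameter-tuples shape parameterized.expand expects.
print(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:])
# (('latent-consistency', 50, 135),)
```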
