
Commit b17d1e0

[OV] Fix data-free VLM compression via optimum-cli (#1058)
* Fix vlm compression
* Extend compression tests to check submodel weights precision
* Update references
* Fix condition
* Export in auto dtype if possible
* Reformat condition
1 parent 9efb8e3 commit b17d1e0

File tree

5 files changed · +171 −59 lines changed


optimum/commands/export/openvino.py

+4 −2

```diff
@@ -368,7 +368,9 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset:
+        elif (task.startswith("text-generation") and quantize_with_dataset) or (
+            task == "image-text-to-text" and quantization_config is not None
+        ):
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM

@@ -378,7 +380,7 @@ def run(self):

                 model_cls = OVModelForVisualCausalLM

-            # To quantize a model with a dataset, an instance of a model class is required
+            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
```
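With this change, a data-free 4-bit export of a VLM goes through the model-class path instead of falling through to the plain export. A minimal sketch of the equivalent Python flow, with an illustrative model id:

```python
# Roughly what the fixed elif branch amounts to for a VLM when a weight-only
# quantization config is passed without a calibration dataset. The CLI
# equivalent would be along the lines of:
#   optimum-cli export openvino --model <model_id> --task image-text-to-text \
#       --weight-format int4 --group-size 16 --ratio 0.8 <output_dir>
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

model = OVModelForVisualCausalLM.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",  # illustrative model id
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16, ratio=0.8),
)
model.save_pretrained("llava_next_int4_ov")
```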

optimum/intel/openvino/modeling_visual_language.py

+2 −1

```diff
@@ -598,7 +598,8 @@ def _from_transformers(
         if load_in_8bit is None and not quantization_config:
             ov_config = None
         else:
-            ov_config = OVConfig(dtype="fp32")
+            # Export in fp32 if compression won't be applied later
+            ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto")

         stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
```
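The practical effect: when compression is going to run anyway, the intermediate export no longer forces fp32, so fp16/bf16 checkpoints are not upcast just to be compressed afterwards. A simplified sketch of the selection logic, with assumed argument names mirroring `_from_transformers`:

```python
from optimum.intel import OVConfig

def pick_export_config(load_in_8bit, quantization_config):
    # No explicit request either way: leave ov_config unset and let the
    # default compression rules decide later.
    if load_in_8bit is None and not quantization_config:
        return None
    # fp32 only when compression is explicitly disabled (load_in_8bit=False);
    # otherwise "auto" keeps the checkpoint's own dtype, since weight
    # compression will be applied to the exported model afterwards.
    return OVConfig(dtype="fp32" if load_in_8bit is False else "auto")
```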

tests/openvino/test_exporters_cli.py

+78 −23

```diff
@@ -14,12 +14,14 @@
 import subprocess
 import unittest
 from pathlib import Path
+from typing import Dict, List

 from parameterized import parameterized
 from transformers import AutoModelForCausalLM
 from utils_tests import (
     _ARCHITECTURES_TO_EXPECTED_INT8,
     MODEL_NAMES,
+    compare_num_quantized_nodes_per_model,
     get_num_quantized_nodes,
 )

@@ -108,37 +110,47 @@ class OVCLIExportTestCase(unittest.TestCase):
         SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56))

     TEST_4BIT_CONFIGURATIONS = [
-        ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}),
-        ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}),
-        ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}),
-        ("text-generation-with-past", "opt125m", "nf4", {"int8": 4, "nf4": 72}),
-        ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}),
+        ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", [{"int8": 4, "int4": 72}]),
+        ("text-generation-with-past", "opt125m", "int4 --group-size 64", [{"int8": 4, "int4": 144}]),
+        ("text-generation-with-past", "opt125m", "mxfp4", [{"int8": 4, "f4e2m1": 72, "f8e8m0": 72}]),
+        ("text-generation-with-past", "opt125m", "nf4", [{"int8": 4, "nf4": 72}]),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --ratio 1.0 --sym --group-size 8 --all-layers",
+            [{"int4": 16}],
+        ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 "
             "--sensitivity-metric max_activation_variance",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --gptq --dataset wikitext2 --num-samples 100 ",
-            {"int8": 4, "int4": 14},
+            [{"int8": 4, "int4": 14}],
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --lora-correction --dataset auto --num-samples 16",
-            {"int8": 60, "int4": 14},
+            [{"int8": 60, "int4": 14}],
+        ),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --group-size 16 --backup-precision none --ratio 0.5",
+            [{"int4": 6}],
         ),
-        ("text-generation-with-past", "llama_awq", "int4 --group-size 16 --backup-precision none", {"int4": 28}),
     ]

     if is_transformers_version(">=", "4.40.0"):
@@ -147,36 +159,73 @@ class OVCLIExportTestCase(unittest.TestCase):
                 (
                     "image-text-to-text",
                     "llava_next",
-                    'int4 --group-size 16 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    "int4 --group-size 16 --ratio 0.8",
+                    [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}],
+                ),
+                (
+                    "image-text-to-text",
+                    "llava_next",
+                    'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" '
                     "--dataset contextual --num-samples 1",
-                    {"int8": 8, "int4": 22},
+                    [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}],
+                ),
+                (
+                    "image-text-to-text",
+                    "nanollava",
+                    "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
                 ),
                 (
                     "image-text-to-text",
                     "nanollava",
-                    'int4 --group-size 8 --ratio 0.9 --sensitivity-metric "mean_activation_variance" '
+                    'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 12, "int4": 18},
+                    [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
                 ),
             ]
         )

     if is_transformers_version(">=", "4.45.0"):
         TEST_4BIT_CONFIGURATIONS.extend(
             [
+                (
+                    "image-text-to-text",
+                    "minicpmv",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                ),
+                (
+                    "image-text-to-text",
+                    "minicpmv",
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}],
+                ),
+                (
+                    "image-text-to-text",
+                    "internvl2",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                ),
                 (
                     "image-text-to-text",
                     "internvl2",
-                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "hessian_input_activation" '
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 6, "int4": 24},
+                    [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}],
+                ),
+                (
+                    "image-text-to-text",
+                    "phi3_v",
+                    "int4 --group-size 4 --ratio 0.8 --trust-remote-code",
+                    [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
                 ),
                 (
                     "image-text-to-text",
                     "phi3_v",
-                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
                     "--dataset contextual --num-samples 1 --trust-remote-code",
-                    {"int8": 4, "int4": 14},
+                    [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}],
                 ),
             ]
         )
```
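Each `image-text-to-text` entry now carries one expected dict per exported submodel, which is why llava_next expects three dicts while minicpmv and phi3_v expect four. The order matches how the test collects submodels (see the final hunk below); as a sketch:

```python
# Assumed submodel order behind the expected lists above, mirroring the
# collection logic in test_exporters_cli_4bit (hypothetical standalone helper):
def collect_vlm_submodels(model):
    submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
    # Model-specific extras (e.g. a resampler) account for the fourth dict
    # in the minicpmv and phi3_v entries.
    submodels += [getattr(model, part) for part in model.additional_parts]
    return submodels
```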
```diff
@@ -300,7 +349,9 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int):
         self.assertEqual(exp_num_fq, num_fq)

     @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
-    def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
+    def test_exporters_cli_4bit(
+        self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: List[Dict]
+    ):
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -317,11 +368,15 @@ def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
                 else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]
             ).from_pretrained(tmpdir, **model_kwargs)

-            ov_model = model.lm_model if task == "image-text-to-text" else model.model
+            submodels = []
+            if task == "text-generation-with-past":
+                submodels = [model]
+            elif task == "image-text-to-text":
+                submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+                submodels += [getattr(model, part) for part in model.additional_parts]
+
+            compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model)

-            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
-            expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
-            self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
             self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
             self.assertTrue("--gptq" not in option or b"Applying GPTQ" in result.stdout)
```
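`compare_num_quantized_nodes_per_model` is defined in `tests/openvino/utils_tests.py`, one of the changed files not shown on this page. Judging from the single-model assertions it replaces, a plausible reconstruction:

```python
# Hypothetical reconstruction of the helper added to tests/openvino/utils_tests.py;
# get_num_quantized_nodes already lives in that module.
def compare_num_quantized_nodes_per_model(test_case, models, expected_num_weight_nodes_per_model):
    test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model))
    for ov_model, expected_num_weight_nodes in zip(models, expected_num_weight_nodes_per_model):
        _, num_weight_nodes = get_num_quantized_nodes(ov_model)
        # Precisions missing from the expectation are expected to have zero nodes
        expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
        test_case.assertEqual(expected_num_weight_nodes, num_weight_nodes)
```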
