
Commit 4aa12d5

Committed Oct 15, 2024
Quantization support for CausalVisualLMs
1 parent b31524c

4 files changed: +135 −31 lines
 

‎optimum/intel/openvino/configuration.py

+17 −8

@@ -26,6 +26,7 @@
 from optimum.configuration_utils import BaseConfig

 from ..utils.import_utils import is_nncf_available
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS


 if is_nncf_available():
@@ -350,6 +351,11 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         qptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
             difference between activations of a compressed and original layer. Dataset is required to run GPTQ.
+        processor (`str`, *optional*):
+            A transformers processor used to process inputs for multi-modal models. You can pass either:
+                - A string, the *model id* of a predefined processor hosted inside a model repo on huggingface.co.
+                - A path to a *directory* containing files required by the processor, for instance saved
+                    using the [`~AutoProcessor.save_pretrained`] method, e.g., `./my_model_directory/`.
     """

     def __init__(
@@ -369,6 +375,7 @@ def __init__(
         scale_estimation: bool = None,
         weight_format: Optional[str] = None,
         gptq: bool = None,
+        processor: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
@@ -383,6 +390,7 @@ def __init__(
         self.scale_estimation = scale_estimation
         self.weight_format = weight_format
         self.gptq = gptq
+        self.processor = processor
         self.post_init()

     def post_init(self):
@@ -400,16 +408,14 @@ def post_init(self):
                 f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
-            llm_datasets = ["wikitext2", "c4", "c4-new"]
-            stable_diffusion_datasets = [
-                "conceptual_captions",
-                "laion/220k-GPT4Vision-captions-from-LIVIS",
-                "laion/filtered-wit",
-            ]
-            if self.dataset not in llm_datasets + stable_diffusion_datasets:
+            lm_datasets = ["wikitext2", "c4", "c4-new"]
+            visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
+            stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
+            if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
+                    {lm_datasets} for LLMs, {visual_lm_datasets} for visual LLMs
+                    or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )

         if self.bits not in [4, 8]:
@@ -444,6 +450,9 @@ def post_init(self):
         if self.tokenizer is not None and not isinstance(self.tokenizer, str):
             raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")

+        if self.processor is not None and not isinstance(self.processor, str):
+            raise ValueError(f"Processor is expected to be a string, but found {self.processor}")
+
         if self.weight_format is None:
             self.weight_format = "int4" if self.bits == 4 else "int8"
         if self.weight_format not in ["int4", "int8", "mxfp4"]:
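Taken together with the loading changes below, these new options can be exercised end to end. A minimal usage sketch (the model id is illustrative, not part of this commit) requesting data-aware 4-bit weight compression for a visual causal LM at load time:

```python
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

# Illustrative checkpoint; any multi-modal model supported by OVModelForVisualCausalLM works the same way.
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    dataset="contextual",   # predefined visual-LM calibration dataset introduced by this commit
    processor=model_id,     # processor used to build the multi-modal calibration inputs
    num_samples=32,
)

model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=True,
    quantization_config=quantization_config,
)
```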

‎optimum/intel/openvino/modeling_visual_language.py

+38 −22

@@ -16,6 +16,7 @@

 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import ensure_stateful_is_available
+from .. import OVQuantizer
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
@@ -178,6 +179,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         ]

     def forward(self, pixel_values, **kwargs):
+        self._compile()
         result = self.request({"pixel_values": pixel_values})
         last_hidden_state = result[0]
         hidden_states = None
@@ -221,7 +223,7 @@ def __init__(
         self.ov_config = {} if ov_config is None else {**ov_config}
         self.preprocessors = kwargs.get("preprocessors", [])
         self.lm_model = language_model
-        self.text_embdings_model = text_embeddings
+        self.text_embeddings_model = text_embeddings
         self.vision_embeddings_model = vision_embeddings
         self._supports_cache_class = False
         self.main_input_name = "input_ids"
@@ -238,13 +240,13 @@ def __init__(
         self._set_ov_config_parameters()
         self.language_model = OVModelWithEmbedForCausalLM(
             self.lm_model,
-            self.text_embdings_model,
+            self.text_embeddings_model,
             config=config,
             deivce=device,
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=not self._compile_only,
+            compile=not self._compile_only and enable_compilation,
             compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
@@ -264,6 +266,18 @@ def __init__(
         except AttributeError:
             pass

+    def clear_requests(self):
+        if self._compile_only:
+            raise ValueError(
+                "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
+            )
+
+        self.language_model.clear_requests()
+        components = [self.vision_embeddings] + [getattr(self, part) for part in self.additional_parts]
+        for component in components:
+            if component is not None:
+                component.request = None
+
     def compile(self):
         self.language_model.compile()
         self.vision_embeddings._compile()
@@ -281,11 +295,11 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             save_directory (`str` or `Path`):
                 The directory where to save the model files.
         """
-        src_files = [self.lm_model, self.text_embdings_model, self.vision_embeddings_model]
+        src_files = [self.lm_model, self.text_embeddings_model, self.vision_embeddings_model]
         dst_file_names = [
             "openvino_language_model.xml",
             "openvino_text_embeddings_model.xml",
-            "openvino_vision_embeddings.xml",
+            "openvino_vision_embeddings_model.xml",
         ]
         for part in self.additional_parts:
             model = getattr(self, f"{part}_model", None)
@@ -364,26 +378,18 @@ def _from_pretrained(
                 raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
             token = use_auth_token

-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
-        compile_only = kwargs.get("compile_only", False)
-
-        # Load model from a local directory
-        if os.path.isdir(model_id):
-            model_save_dir = Path(model_id)
         model_file_names = {
             "language_model": "openvino_language_model.xml",
             "text_embeddings": "openvino_text_embeddings_model.xml",
             "vision_embeddings": "openvino_vision_embeddings_model.xml",
         }

+        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
         for part in model_cls.additional_parts:
             model_file_names[part] = f"openvino_{part}_model.xml"
-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         compile_only = kwargs.get("compile_only", False)
         if os.path.isdir(model_id):
+            # Load model from a local directory
             model_save_dir = Path(model_id)
             file_names = {k: os.path.join(model_id, model_file_names[k]) for k in model_file_names}
         else:
@@ -401,11 +407,11 @@ def _from_pretrained(
                 file_names[name] = model_cache_path
             model_save_dir = Path(model_cache_path).parent
         if not compile_only:
-            language_model = model_cls.load_model(file_names["language_model"], quantization_config)
-            text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
-            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"], quantization_config)
+            language_model = model_cls.load_model(file_names["language_model"])
+            text_embeddings = model_cls.load_model(file_names["text_embeddings"])
+            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"])
             for part in model_cls.additional_parts:
-                kwargs[part] = model_cls.load_model(file_names[part], quantization_config)
+                kwargs[part] = model_cls.load_model(file_names[part])
         else:
             language_model = model_cls._compile_model(
                 file_names["language_model"],
@@ -445,7 +451,12 @@ def _from_pretrained(
         except Exception:
             pass

-        return model_cls(
+        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
+        to_quantize = not compile_only and quantization_config is not None
+        if to_quantize:
+            kwargs["compile"] = False
+
+        model = model_cls(
             language_model=language_model,
             text_embeddings=text_embeddings,
             vision_embeddings=vision_embeddings,
@@ -455,6 +466,11 @@ def _from_pretrained(
             **kwargs,
         )

+        if to_quantize:
+            OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config))
+
+        return model
+
     @classmethod
     def _from_transformers(
         cls,
@@ -533,8 +549,8 @@ def half(self):
         """
         apply_moc_transformations(self.lm_model, cf=False)
         compress_model_transformation(self.lm_model)
-        apply_moc_transformations(self.text_embdings_model, cf=False)
-        compress_model_transformation(self.text_embdings_model)
+        apply_moc_transformations(self.text_embeddings_model, cf=False)
+        compress_model_transformation(self.text_embeddings_model)
         apply_moc_transformations(self.vision_embeddings_model, cf=False)
         compress_model_transformation(self.vision_embeddings_model)
         for part in self.additional_parts:
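The loading path above now defers quantization: submodels are loaded uncompressed, the wrapper is built with `compile=False`, and `OVQuantizer` is invoked on the assembled model. A sketch of the equivalent manual flow, assuming an already exported model directory (the paths are placeholders):

```python
from optimum.intel import OVConfig, OVModelForVisualCausalLM, OVQuantizer, OVWeightQuantizationConfig

# Load the exported OpenVINO model without weight compression and without compiling,
# mirroring the `to_quantize` branch added to `_from_pretrained`.
model = OVModelForVisualCausalLM.from_pretrained(
    "./ov_model_dir",          # placeholder path to an exported model
    compile=False,
    load_in_8bit=False,
)

wc_config = OVWeightQuantizationConfig(bits=4, dataset="contextual", processor="./ov_model_dir")
OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=wc_config))

model.compile()  # inference requests are cleared during quantization, so compile again before generation
```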

‎optimum/intel/openvino/quantization.py

+72 −1

@@ -22,8 +22,10 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

+import datasets
 import nncf
 import openvino
+import requests
 import torch
 import transformers
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
@@ -33,9 +35,11 @@
 from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
+from PIL import Image
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader, RandomSampler
-from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
+from tqdm import tqdm
+from transformers import AutoProcessor, AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available

@@ -62,6 +66,7 @@
     ONNX_WEIGHTS_NAME,
     OV_XML_FILE_NAME,
     PREDEFINED_SD_DATASETS,
+    PREDEFINED_VISUAL_LM_DATASETS,
 )


@@ -313,6 +318,8 @@ def _quantize_ovbasemodel(
         remove_unused_columns: bool = True,
         **kwargs,
     ):
+        from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM
+
         if is_diffusers_available():
             from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline

@@ -361,6 +368,8 @@ def _quantize_ovbasemodel(

             if isinstance(self.model, OVModelForCausalLM):
                 calibration_dataset = self._prepare_causal_lm_dataset(quantization_config)
+            elif isinstance(self.model, OVModelForVisualCausalLM):
+                calibration_dataset = self._prepare_visual_causal_lm_dataset(quantization_config)
             elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                 if not isinstance(quantization_config.dataset, str):
                     raise ValueError("Please provide dataset as one of the accepted dataset labels.")
@@ -401,6 +410,14 @@ def _quantize_ovbasemodel(
                 for sub_model in sub_models:
                     _weight_only_quantization(sub_model.model, quantization_config)
                 self.model.clear_requests()
+            elif isinstance(self.model, OVModelForVisualCausalLM):
+                language_model = self.model.language_model
+                _weight_only_quantization(language_model.model, quantization_config, calibration_dataset)
+                sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
+                sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names]
+                for sub_model in sub_models:
+                    _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=False))
+                self.model.clear_requests()
             else:
                 _weight_only_quantization(self.model.model, quantization_config, calibration_dataset)
                 self.model.request = None
@@ -713,6 +730,60 @@ def _prepare_causal_lm_dataset(self, quantization_config: OVWeightQuantizationCo

         return calibration_dataset

+    def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig, max_tokens=32):
+        dataset_name = config.dataset
+        if dataset_name not in PREDEFINED_VISUAL_LM_DATASETS:
+            raise ValueError(
+                "You have entered a string value for dataset. You can only choose between"
+                f"{list(PREDEFINED_VISUAL_LM_DATASETS.keys())}, but the {dataset_name} was found"
+            )
+
+        dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[dataset_name]
+        dataset = datasets.load_dataset(dataset_metadata["name"], split=dataset_metadata["split"]).shuffle(seed=0)
+        num_samples = min(config.num_samples or 128, len(dataset))
+
+        calibration_dataset = []
+        processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code)
+        pbar = tqdm(desc="Collecting calibration dataset", total=num_samples)
+        for item in dataset:
+            image_url = item[dataset_metadata["inputs"]["image_url"]]
+            instruction = item[dataset_metadata["inputs"]["instruction"]]
+            image = Image.open(requests.get(image_url, stream=True).raw)
+
+            chat_template = [{"role": "user", "content": [{"type": "text", "text": instruction}, {"type": "image"}]}]
+            prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True)
+
+            inputs = processor(images=image, text=prompt, return_tensors="pt")
+            if inputs.input_ids.size(1) > max_tokens:
+                continue
+            input_ids = inputs.input_ids
+            attention_mask = inputs.attention_mask
+            position_ids = torch.arange(attention_mask.size(1)).unsqueeze(0).to(attention_mask.device)
+            pixel_values = inputs.pixel_values
+            image_sizes = inputs.image_sizes
+
+            inputs_embeds, attention_mask, position_ids = self.model.get_multimodal_embeddings(
+                input_ids,
+                pixel_values,
+                image_sizes=image_sizes,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+            )
+
+            language_model_inputs = self.model.language_model.prepare_inputs(
+                input_ids=None,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                inputs_embeds=inputs_embeds,
+            )
+            pbar.update(1)
+            calibration_dataset.append(language_model_inputs)
+            if len(calibration_dataset) == num_samples:
+                break
+
+        calibration_dataset = nncf.Dataset(calibration_dataset)
+        return calibration_dataset
+
     def _prepare_text_generation_dataset(
         self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader
     ) -> nncf.Dataset:
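For reference, a condensed standalone sketch of what `_prepare_visual_causal_lm_dataset` does for a single sample; the processor id and image URL below are placeholders chosen for illustration, whereas the real helper takes them from `config.processor` and from the dataset entries described by `PREDEFINED_VISUAL_LM_DATASETS`:

```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor

# Placeholder processor id; any multi-modal processor with an image-aware chat template would do.
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

image = Image.open(
    requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)
chat = [{"role": "user", "content": [{"type": "text", "text": "Describe the image."}, {"type": "image"}]}]
prompt = processor.apply_chat_template(chat, add_generation_prompt=True)

inputs = processor(images=image, text=prompt, return_tensors="pt")
position_ids = torch.arange(inputs.attention_mask.size(1)).unsqueeze(0)

# In the quantizer, `inputs` and `position_ids` are then passed through
# model.get_multimodal_embeddings() and language_model.prepare_inputs()
# before being collected into an nncf.Dataset for data-aware weight compression.
```

Note the resulting scheme: only the language model receives the user-provided, data-aware configuration, while the vision and text embedding submodels are always compressed data-free to 8-bit.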

‎optimum/intel/openvino/utils.py

+8 −0

@@ -131,6 +131,14 @@
     "laion/filtered-wit": {"split": "train", "inputs": {"prompt": "caption"}},
 }

+PREDEFINED_VISUAL_LM_DATASETS = {
+    "contextual": {
+        "name": "ucla-contextual/contextual_test",
+        "split": "test",
+        "inputs": {"image_url": "image_url", "instruction": "instruction"},
+    }
+}
+

 NEED_CONVERT_TO_FAST_TOKENIZER: Tuple[Type[PreTrainedTokenizer]] = (CLIPTokenizer,)
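The new mapping keeps dataset handling declarative: each entry names a Hugging Face Hub dataset id, a split, and the column names from which the image URL and instruction are read. A hypothetical sketch of how another multi-modal dataset could be registered, following the same schema (the dataset id and column names are made up for illustration):

```python
# Hypothetical entry mirroring the schema of the "contextual" dataset above.
PREDEFINED_VISUAL_LM_DATASETS["my_vqa_subset"] = {
    "name": "my-org/my-vqa-dataset",  # dataset id on the Hugging Face Hub
    "split": "validation",
    "inputs": {"image_url": "img_url", "instruction": "question"},
}
```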
