Skip to content

Commit 05dde92

Browse files
Refactor custom dataset building: use dataset instead of dataloader
1 parent 14db64b commit 05dde92

File tree

4 files changed

+92
-106
lines changed

4 files changed

+92
-106
lines changed

optimum/intel/openvino/modeling_visual_language.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1604,7 +1604,7 @@ def resampling(self, x, tgt_sizes):
16041604

16051605
def _set_2d_pos_cache(self, max_size):
16061606
pos_embed = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
1607-
self._pos_embed = pos_embed
1607+
self._pos_embeds = pos_embed
16081608

16091609
def _adjust_pos_cache(self, tgt_sizes):
16101610
max_h = torch.max(tgt_sizes[:, 0])

optimum/intel/openvino/quantization/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,5 @@
2727

2828

2929
if is_nncf_available():
30+
# Quantization is possible only if nncf is installed
3031
from .quantizer import OVQuantizer

optimum/intel/openvino/quantization/calibration_dataset_builder.py

+83-100
Original file line numberDiff line numberDiff line change
@@ -173,19 +173,32 @@ def build_from_dataset(
173173
if is_diffusers_available():
174174
from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline
175175

176-
dataloader = self._get_calibration_dataloader(dataset, batch_size, data_collator, remove_unused_columns)
177-
178-
if isinstance(self.model, OVBaseDecoderModel):
179-
return self._prepare_decoder_calibration_data(quantization_config, dataloader)
180-
elif isinstance(self.model, OVModelForVisualCausalLM):
181-
return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataloader)
182-
elif isinstance(self.model, _OVModelForWhisper):
183-
return self._prepare_speech_to_text_calibration_data(quantization_config, dataloader)
184-
elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
185-
return self._prepare_diffusion_calibration_data(quantization_config, dataloader)
176+
if isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)) or (
177+
is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)
178+
):
179+
# Prepare from raw dataset avoiding dataloader creation
180+
if batch_size != 1 or data_collator is not None or remove_unused_columns:
181+
logger.warning(
182+
"`batch_size`, `data_collator` and `remove_unused_columns` are not supported for this type of model."
183+
)
184+
185+
if isinstance(self.model, OVModelForVisualCausalLM):
186+
return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
187+
elif isinstance(self.model, _OVModelForWhisper):
188+
return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
189+
elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
190+
return self._prepare_diffusion_calibration_data(quantization_config, dataset)
191+
else:
192+
# TODO
193+
raise Exception()
186194
else:
187-
# Torch model quantization scenario
188-
return {"model": nncf.Dataset(dataloader)}
195+
# Prepare from dataloader
196+
dataloader = self._get_calibration_dataloader(dataset, batch_size, data_collator, remove_unused_columns)
197+
if isinstance(self.model, OVBaseDecoderModel):
198+
return self._prepare_decoder_calibration_data(quantization_config, dataloader)
199+
else:
200+
# Torch model quantization scenario
201+
return {"model": nncf.Dataset(dataloader)}
189202

190203
def build_from_dataset_name(
191204
self,
@@ -257,6 +270,9 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> Di
257270
if is_diffusers_available():
258271
from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline
259272

273+
if config.dataset is None:
274+
raise ValueError("Please provide a dataset for calibration.")
275+
260276
if isinstance(self.model, OVModelForCausalLM):
261277
return self._prepare_causal_lm_calibration_data(config)
262278
elif isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)):
@@ -266,108 +282,46 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> Di
266282
"model id, or a path to a directory containing all the required configuration files."
267283
)
268284

269-
trc = config.trust_remote_code
270-
processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=trc)
271285
if isinstance(self.model, OVModelForVisualCausalLM):
272-
try:
273-
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=trc)
274-
tokenizer_error = None
275-
except Exception as tokenizer_error: # noqa: F841
276-
tokenizer = None
277-
278286
dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[config.dataset]
279-
280-
def preprocess_function(item):
281-
inputs_metadata = dataset_metadata["inputs"]
282-
instruction = item[inputs_metadata["instruction"]]
283-
image_url = item[inputs_metadata["image_url"]]
284-
285-
image = Image.open(requests.get(image_url, stream=True).raw)
286-
287-
try:
288-
inputs = self.model.preprocess_inputs(
289-
text=instruction,
290-
image=image,
291-
processor=processor,
292-
tokenizer=tokenizer,
293-
config=self.model.config,
294-
)
295-
# Remove batch dimension
296-
for key in inputs.keys():
297-
inputs[key] = inputs[key][0]
298-
except ValueError as value_error:
299-
if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
300-
raise tokenizer_error
301-
raise value_error
302-
303-
return inputs
304-
305287
return self.build_from_dataset_name(
306288
config,
307289
dataset_metadata["id"],
308290
num_samples=config.num_samples,
309291
dataset_split=dataset_metadata["split"],
310-
preprocess_function=preprocess_function,
311-
preprocess_batch=False,
312-
trust_remote_code=trc,
292+
trust_remote_code=config.trust_remote_code,
313293
)
314294
elif isinstance(self.model, _OVModelForWhisper):
315295
dataset_metadata = PREDEFINED_SPEECH_TO_TEXT_DATASETS[config.dataset]
316-
317-
def preprocess_function(item):
318-
audio = item["audio"]["array"]
319-
sampling_rate = item["audio"]["sampling_rate"]
320-
inputs = processor(audio, sampling_rate=sampling_rate, return_tensors="pt")
321-
# This way key "audio" in the original dict will be overridden and not cause problems
322-
return {"audio": inputs.input_features[0]}
323-
324296
return self.build_from_dataset_name(
325297
config,
326298
dataset_metadata["id"],
327299
num_samples=config.num_samples, # This is an upper bound on how many audios are needed
328300
dataset_config_name=dataset_metadata["name"],
329301
dataset_split=dataset_metadata["split"],
330-
preprocess_function=preprocess_function,
331-
preprocess_batch=False,
332-
trust_remote_code=trc,
302+
trust_remote_code=config.trust_remote_code,
333303
streaming=dataset_metadata["streaming"],
334304
)
335305
else:
336306
raise Exception
337307
elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
338-
dataset = config.dataset
339-
340-
dataset_metadata = None
341-
if isinstance(dataset, str):
342-
dataset_name = dataset
308+
if isinstance(config.dataset, str):
309+
dataset_name = config.dataset
343310
dataset_metadata = PREDEFINED_DIFFUSION_DATASETS[dataset_name]
344311

345-
def preprocess_function(item):
346-
return {"prompt": item[dataset_metadata["prompt_column_name"]]}
347-
348312
dataset = self.load_dataset(
349313
dataset_name,
350314
num_samples=config.num_samples, # This is an upper bound on how many prompts are needed
351315
dataset_split=dataset_metadata["split"],
352-
preprocess_function=preprocess_function,
353-
preprocess_batch=False,
354316
streaming=dataset_metadata["streaming"],
355317
)
356-
elif not (isinstance(dataset, list) and all(isinstance(it, str) for it in dataset)):
357-
raise Exception
358-
359-
def collate_fn(features):
360-
first = features[0]
361-
if isinstance(first, dict) and dataset_metadata is not None:
362-
# List of dicts case
363-
batch = [it["prompt"] for it in features]
364-
else:
365-
# List of strings case
366-
# TODO: ?
367-
batch = features
368-
return batch
318+
elif isinstance(config.dataset, list) and all(isinstance(it, str) for it in config.dataset):
319+
dataset = config.dataset
320+
else:
321+
# TODO
322+
raise Exception()
369323

370-
return self.build_from_dataset(config, dataset, data_collator=collate_fn)
324+
return self.build_from_dataset(config, dataset)
371325

372326
def load_dataset(
373327
self,
@@ -514,11 +468,33 @@ def _prepare_causal_lm_calibration_data(
514468
return {"model": calibration_dataset}
515469

516470
def _prepare_visual_causal_lm_calibration_data(
517-
self, quantization_config: OVQuantizationConfigBase, dataloader: OVDataLoader
471+
self, config: OVQuantizationConfigBase, dataset: "Dataset"
518472
) -> Dict[str, nncf.Dataset]:
473+
processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code)
474+
try:
475+
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)
476+
tokenizer_error = None
477+
except Exception as tokenizer_error: # noqa: F841
478+
tokenizer = None
479+
480+
dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[config.dataset]
481+
519482
calibration_data = []
520-
num_samples = quantization_config.num_samples or 32
521-
for inputs in tqdm(dataloader, desc="Collecting calibration dataset", total=num_samples):
483+
num_samples = config.num_samples or 32
484+
for item in tqdm(dataset, desc="Collecting calibration dataset", total=num_samples):
485+
instruction = item[dataset_metadata["inputs"]["instruction"]]
486+
image_url = item[dataset_metadata["inputs"]["image_url"]]
487+
image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
488+
489+
try:
490+
inputs = self.model.preprocess_inputs(
491+
text=instruction, image=image, processor=processor, tokenizer=tokenizer, config=self.model.config
492+
)
493+
except ValueError as value_error:
494+
if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
495+
raise tokenizer_error
496+
raise value_error
497+
522498
input_ids = inputs.get("input_ids")
523499
position_ids = torch.arange(input_ids.size(1)).unsqueeze(0).to(input_ids.device)
524500

@@ -542,7 +518,7 @@ def _prepare_visual_causal_lm_calibration_data(
542518
return {"lm_model": nncf.Dataset(calibration_data)}
543519

544520
def _prepare_speech_to_text_calibration_data(
545-
self, quantization_config: OVQuantizationConfigBase, dataloader: OVDataLoader
521+
self, config: OVQuantizationConfigBase, dataset: "Dataset"
546522
) -> Dict[str, nncf.Dataset]:
547523
from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder
548524

@@ -559,12 +535,17 @@ def _prepare_speech_to_text_calibration_data(
559535
)
560536

561537
try:
562-
# Download audio inputs beforehand to avoid possible connection issues
563-
num_samples = quantization_config.num_samples or 32
564-
audio_inputs = list(tqdm(dataloader, desc="Downloading audio inputs", total=num_samples))
538+
processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code)
565539

566-
for inputs in tqdm(audio_inputs, desc="Collecting calibration data"):
567-
self.model.generate(inputs["audio"])
540+
# Download audio inputs beforehand to avoid possible connection issues
541+
num_samples = config.num_samples or 32
542+
downloaded_dataset = list(tqdm(dataset, desc="Downloading audio inputs", total=num_samples))
543+
544+
for item in tqdm(downloaded_dataset, desc="Collecting calibration data"):
545+
audio = item["audio"]["array"]
546+
sampling_rate = item["audio"]["sampling_rate"]
547+
input_features = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_features
548+
self.model.generate(input_features)
568549
finally:
569550
for model in models.values():
570551
model.request = model.request.request
@@ -575,7 +556,7 @@ def _prepare_speech_to_text_calibration_data(
575556
return calibration_data
576557

577558
def _prepare_diffusion_calibration_data(
578-
self, quantization_config: OVQuantizationConfigBase, dataloader: OVDataLoader
559+
self, config: OVQuantizationConfigBase, dataset: "Dataset"
579560
) -> Dict[str, nncf.Dataset]:
580561
self.model.compile()
581562

@@ -585,16 +566,18 @@ def _prepare_diffusion_calibration_data(
585566
size = diffuser.config.get("sample_size", 64) * self.model.vae_scale_factor
586567
height, width = 2 * (min(size, 512),)
587568

588-
num_samples = quantization_config.num_samples or 200
569+
num_samples = config.num_samples or 200
589570
calibration_data = []
590571
try:
591572
diffuser.request = InferRequestWrapper(diffuser.request, calibration_data)
592573

593-
for inputs in tqdm(dataloader, desc="Collecting calibration data"):
594-
if isinstance(inputs, dict):
595-
self.model(**inputs, height=height, width=width)
596-
else:
597-
self.model(inputs, height=height, width=width)
574+
for item in tqdm(dataset, desc="Collecting calibration data"):
575+
prompt = (
576+
item[PREDEFINED_DIFFUSION_DATASETS[config.dataset]["prompt_column_name"]]
577+
if isinstance(item, dict)
578+
else item
579+
)
580+
self.model(prompt, height=height, width=width)
598581
if len(calibration_data) >= num_samples:
599582
break
600583
finally:

optimum/intel/openvino/quantization/quantizer.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,7 @@ def quantize(
153153
ov_config = OVConfig()
154154
if not isinstance(ov_config, OVConfig):
155155
raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.")
156-
quantization_config = ov_config.quantization_config
157-
if quantization_config is None:
156+
if ov_config.quantization_config is None:
158157
logger.warning(
159158
"`quantization_config` was not provided. In the future, please provide `quantization_config`"
160159
)
@@ -167,6 +166,7 @@ def quantize(
167166

168167
# TODO: add deprecation warning for Sized dataset
169168

169+
quantization_config = ov_config.quantization_config
170170
if quantization_config.dataset is not None and calibration_dataset is not None:
171171
logger.info(
172172
"Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only "
@@ -188,7 +188,7 @@ def quantize(
188188
and isinstance(calibration_dataset, Dataset)
189189
and "caption" in calibration_dataset.column_names
190190
):
191-
# TODO: analyze this execution path
191+
# TODO: deprecate this path
192192
calibration_dataset = calibration_dataset.select_columns(["caption"])
193193

194194
if (
@@ -198,10 +198,12 @@ def quantize(
198198
and all(isinstance(it, str) for it in calibration_dataset)
199199
):
200200
# TODO: deprecate this way of providing calibration dataset
201-
data_collator = data_collator or (lambda x: x)
201+
if quantization_config.dataset is not None:
202+
raise Exception()
203+
quantization_config.dataset = calibration_dataset
202204

203205
calibration_dataset = self.dataset_builder.build_from_dataset(
204-
quantization_config, calibration_dataset, batch_size, data_collator, remove_unused_columns
206+
quantization_config, calibration_dataset, batch_size, remove_unused_columns
205207
)
206208

207209
from ..modeling_base import OVBaseModel

0 commit comments

Comments
 (0)