
Commit 0fea8f4

Reorder methods
1 parent 2ece9a0 commit 0fea8f4

File tree

1 file changed, +97 -97 lines changed


optimum/intel/openvino/quantization/calibration_dataset_builder.py

+97 -97
@@ -205,71 +205,76 @@ def __init__(self, model: transformers.PreTrainedModel, seed: int = 42):
         signature = inspect.signature(self.model.forward)
         self._signature_columns = list(signature.parameters.keys())

-    def build_from_dataset(
-        self,
-        quantization_config: OVQuantizationConfigBase,
-        dataset: Union["Dataset", List],
-        batch_size: Optional[int] = 1,
-        data_collator: Optional[DataCollator] = None,
-        remove_unused_columns: bool = False,
-    ) -> CalibrationDataset:
+    def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> CalibrationDataset:
         """
+        Builds a calibration dataset from a quantization config object. Namely, `quantization_config.dataset` property
+        is used to infer dataset name.

         Args:
-            quantization_config (`OVQuantizationConfigBase`):
+            config (`OVQuantizationConfigBase`):
                 The quantization configuration object.
-            dataset (`Union[datasets.Dataset, List]`):
-                The dataset to collect calibration data from.
-            batch_size (`int`, defaults to 1):
-                The number of calibration samples to load per batch. Not always used.
-            data_collator (`DataCollator`, *optional*):
-                The function to use to form a batch from a list of elements of the calibration dataset. Not always used.
-            remove_unused_columns (`bool`, defaults to `False`):
-                Whether to remove the columns unused by the model forward method. Not always used.
         Returns:
             A calibration dataset as an instance of `CalibrationDataset` containing an `nncf.Dataset` for each model component.
         """
-        from optimum.intel import OVModelForVisualCausalLM
-        from optimum.intel.openvino.modeling_decoder import OVBaseDecoderModel
+        from optimum.intel import OVModelForCausalLM, OVModelForVisualCausalLM
         from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper

         if is_diffusers_available():
             from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline

-        if isinstance(dataset, list):
-            logger.warning(
-                "Providing dataset as a list is deprecated and will be removed in optimum-intel v1.25. "
-                "Please provide it as `datasets.Dataset`."
-            )
+        if config.dataset is None:
+            raise ValueError("Please provide a dataset for calibration.")

-        if isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)) or (
-            is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)
-        ):
-            # Prepare from raw dataset avoiding dataloader creation
-            if batch_size != 1 or data_collator is not None or remove_unused_columns:
-                logger.warning(
-                    "`batch_size`, `data_collator` and `remove_unused_columns` are not supported for this type of model."
+        if isinstance(self.model, OVModelForCausalLM):
+            return self._prepare_causal_lm_calibration_data(config)
+        elif isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)):
+            if config.processor is None:
+                raise ValueError(
+                    "`processor` must be specified in order to run data-aware quantization. Please provide it as a"
+                    "model id, or a path to a directory containing all the required configuration files."
                 )

             if isinstance(self.model, OVModelForVisualCausalLM):
-                return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
+                dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[config.dataset]
+                return self.build_from_dataset_name(
+                    config,
+                    dataset_metadata["id"],
+                    num_samples=config.num_samples,
+                    dataset_split=dataset_metadata["split"],
+                    trust_remote_code=config.trust_remote_code,
+                )
             elif isinstance(self.model, _OVModelForWhisper):
-                return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
-            elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
-                return self._prepare_diffusion_calibration_data(quantization_config, dataset)
+                dataset_metadata = PREDEFINED_SPEECH_TO_TEXT_DATASETS[config.dataset]
+                return self.build_from_dataset_name(
+                    config,
+                    dataset_metadata["id"],
+                    num_samples=config.num_samples,  # This is an upper bound on how many audios are needed
+                    dataset_config_name=dataset_metadata["name"],
+                    dataset_split=dataset_metadata["split"],
+                    trust_remote_code=config.trust_remote_code,
+                    streaming=dataset_metadata["streaming"],
+                )
             else:
-                raise RuntimeError("Unsupported model type for calibration dataset collection.")
-        else:
-            # Prepare from dataloader
-            # Setting `remove_unused_columns=True` until it is not deprecated
-            dataloader = self._get_calibration_dataloader(
-                dataset, batch_size, data_collator, remove_unused_columns=True
-            )
-            if isinstance(self.model, OVBaseDecoderModel):
-                return self._prepare_decoder_calibration_data(quantization_config, dataloader)
+                raise Exception
+        elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
+            if isinstance(config.dataset, str):
+                dataset_name = config.dataset
+                dataset_metadata = PREDEFINED_DIFFUSION_DATASETS[dataset_name]
+
+                dataset = self.load_dataset(
+                    dataset_name,
+                    num_samples=config.num_samples,  # This is an upper bound on how many prompts are needed
+                    dataset_split=dataset_metadata["split"],
+                    streaming=dataset_metadata["streaming"],
+                )
+            elif isinstance(config.dataset, list) and all(isinstance(it, str) for it in config.dataset):
+                dataset = config.dataset
             else:
-                # Assuming this is the torch model quantization scenario
-                return CalibrationDataset({"model": nncf.Dataset(dataloader)})
+                raise RuntimeError(
+                    "Please provide dataset as one of the accepted dataset labels or as a list of string prompts."
+                )
+
+            return self.build_from_dataset(config, dataset)

     def build_from_dataset_name(
         self,
@@ -346,76 +351,71 @@ def build_from_dataset_name(

         return self.build_from_dataset(quantization_config, dataset, batch_size, data_collator, remove_unused_columns)

-    def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> CalibrationDataset:
+    def build_from_dataset(
+        self,
+        quantization_config: OVQuantizationConfigBase,
+        dataset: Union["Dataset", List],
+        batch_size: Optional[int] = 1,
+        data_collator: Optional[DataCollator] = None,
+        remove_unused_columns: bool = False,
+    ) -> CalibrationDataset:
         """
-        Builds a calibration dataset from a quantization config object. Namely, `quantization_config.dataset` property
-        is used to infer dataset name.

         Args:
-            config (`OVQuantizationConfigBase`):
+            quantization_config (`OVQuantizationConfigBase`):
                 The quantization configuration object.
+            dataset (`Union[datasets.Dataset, List]`):
+                The dataset to collect calibration data from.
+            batch_size (`int`, defaults to 1):
+                The number of calibration samples to load per batch. Not always used.
+            data_collator (`DataCollator`, *optional*):
+                The function to use to form a batch from a list of elements of the calibration dataset. Not always used.
+            remove_unused_columns (`bool`, defaults to `False`):
+                Whether to remove the columns unused by the model forward method. Not always used.
         Returns:
             A calibration dataset as an instance of `CalibrationDataset` containing an `nncf.Dataset` for each model component.
         """
-        from optimum.intel import OVModelForCausalLM, OVModelForVisualCausalLM
+        from optimum.intel import OVModelForVisualCausalLM
+        from optimum.intel.openvino.modeling_decoder import OVBaseDecoderModel
         from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper

         if is_diffusers_available():
             from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline

-        if config.dataset is None:
-            raise ValueError("Please provide a dataset for calibration.")
+        if isinstance(dataset, list):
+            logger.warning(
+                "Providing dataset as a list is deprecated and will be removed in optimum-intel v1.25. "
+                "Please provide it as `datasets.Dataset`."
+            )

-        if isinstance(self.model, OVModelForCausalLM):
-            return self._prepare_causal_lm_calibration_data(config)
-        elif isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)):
-            if config.processor is None:
-                raise ValueError(
-                    "`processor` must be specified in order to run data-aware quantization. Please provide it as a"
-                    "model id, or a path to a directory containing all the required configuration files."
+        if isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)) or (
+            is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)
+        ):
+            # Prepare from raw dataset avoiding dataloader creation
+            if batch_size != 1 or data_collator is not None or remove_unused_columns:
+                logger.warning(
+                    "`batch_size`, `data_collator` and `remove_unused_columns` are not supported for this type of model."
                 )

             if isinstance(self.model, OVModelForVisualCausalLM):
-                dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[config.dataset]
-                return self.build_from_dataset_name(
-                    config,
-                    dataset_metadata["id"],
-                    num_samples=config.num_samples,
-                    dataset_split=dataset_metadata["split"],
-                    trust_remote_code=config.trust_remote_code,
-                )
+                return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
             elif isinstance(self.model, _OVModelForWhisper):
-                dataset_metadata = PREDEFINED_SPEECH_TO_TEXT_DATASETS[config.dataset]
-                return self.build_from_dataset_name(
-                    config,
-                    dataset_metadata["id"],
-                    num_samples=config.num_samples,  # This is an upper bound on how many audios are needed
-                    dataset_config_name=dataset_metadata["name"],
-                    dataset_split=dataset_metadata["split"],
-                    trust_remote_code=config.trust_remote_code,
-                    streaming=dataset_metadata["streaming"],
-                )
+                return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
+            elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
+                return self._prepare_diffusion_calibration_data(quantization_config, dataset)
             else:
-                raise Exception
-        elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
-            if isinstance(config.dataset, str):
-                dataset_name = config.dataset
-                dataset_metadata = PREDEFINED_DIFFUSION_DATASETS[dataset_name]
-
-                dataset = self.load_dataset(
-                    dataset_name,
-                    num_samples=config.num_samples,  # This is an upper bound on how many prompts are needed
-                    dataset_split=dataset_metadata["split"],
-                    streaming=dataset_metadata["streaming"],
-                )
-            elif isinstance(config.dataset, list) and all(isinstance(it, str) for it in config.dataset):
-                dataset = config.dataset
+                raise RuntimeError("Unsupported model type for calibration dataset collection.")
+        else:
+            # Prepare from dataloader
+            # Setting `remove_unused_columns=True` until it is not deprecated
+            dataloader = self._get_calibration_dataloader(
+                dataset, batch_size, data_collator, remove_unused_columns=True
+            )
+            if isinstance(self.model, OVBaseDecoderModel):
+                return self._prepare_decoder_calibration_data(quantization_config, dataloader)
             else:
-                raise RuntimeError(
-                    "Please provide dataset as one of the accepted dataset labels or as a list of string prompts."
-                )
-
-            return self.build_from_dataset(config, dataset)
+                # Assuming this is the torch model quantization scenario
+                return CalibrationDataset({"model": nncf.Dataset(dataloader)})

     def load_dataset(
         self,
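For orientation, a minimal usage sketch of the entry points touched by this reorder (not part of the commit). It assumes the builder class defined in calibration_dataset_builder.py is importable as `OVCalibrationDatasetBuilder` and that `OVQuantizationConfig` accepts a predefined dataset label; `build_from_quantization_config` resolves the dataset from the config and dispatches per model type, ultimately delegating to `build_from_dataset`.

# Hypothetical usage sketch; the class name OVCalibrationDatasetBuilder, its import
# path, and the "wikitext2" dataset label are assumptions, not confirmed by this diff.
from optimum.intel import OVModelForCausalLM, OVQuantizationConfig
from optimum.intel.openvino.quantization.calibration_dataset_builder import (
    OVCalibrationDatasetBuilder,  # assumed name of the builder class defined in this file
)

# Export a small causal LM to OpenVINO IR.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True)

# Quantization config that names a calibration dataset instead of passing one explicitly.
config = OVQuantizationConfig(bits=8, dataset="wikitext2", num_samples=32)

builder = OVCalibrationDatasetBuilder(model)
# Infers the dataset from config.dataset, dispatches on the model type, and
# eventually delegates to build_from_dataset(), as shown in the diff above.
calibration_dataset = builder.build_from_quantization_config(config)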
