@@ -205,71 +205,76 @@ def __init__(self, model: transformers.PreTrainedModel, seed: int = 42):
205
205
signature = inspect .signature (self .model .forward )
206
206
self ._signature_columns = list (signature .parameters .keys ())
207
207
208
- def build_from_dataset (
209
- self ,
210
- quantization_config : OVQuantizationConfigBase ,
211
- dataset : Union ["Dataset" , List ],
212
- batch_size : Optional [int ] = 1 ,
213
- data_collator : Optional [DataCollator ] = None ,
214
- remove_unused_columns : bool = False ,
215
- ) -> CalibrationDataset :
208
def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> CalibrationDataset:
    """
    Builds a calibration dataset from a quantization config object. Namely, the `config.dataset` property
    is used to decide which data to calibrate on.

    Args:
        config (`OVQuantizationConfigBase`):
            The quantization configuration object.
    Returns:
        A calibration dataset as an instance of `CalibrationDataset` containing an `nncf.Dataset` for each model component.
    Raises:
        ValueError:
            If `config.dataset` is `None`, or if `config.processor` is `None` for a visual causal LM or
            Whisper model.
        RuntimeError:
            If the model is a diffusion pipeline and `config.dataset` is neither a string nor a list of strings.
    """
    # Imported lazily inside the method, as in the original code
    # (presumably to avoid circular imports -- TODO confirm).
    from optimum.intel import OVModelForCausalLM, OVModelForVisualCausalLM
    from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper

    if is_diffusers_available():
        from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline

    if config.dataset is None:
        raise ValueError("Please provide a dataset for calibration.")

    if isinstance(self.model, OVModelForCausalLM):
        return self._prepare_causal_lm_calibration_data(config)

    if isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)):
        if config.processor is None:
            raise ValueError(
                "`processor` must be specified in order to run data-aware quantization. Please provide it as a "
                "model id, or a path to a directory containing all the required configuration files."
            )

        if isinstance(self.model, OVModelForVisualCausalLM):
            # NOTE(review): a plain mapping lookup -- presumably raises `KeyError` for a dataset label
            # that is not predefined; verify against the definition of the mapping.
            dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[config.dataset]
            return self.build_from_dataset_name(
                config,
                dataset_metadata["id"],
                num_samples=config.num_samples,
                dataset_split=dataset_metadata["split"],
                trust_remote_code=config.trust_remote_code,
            )

        if isinstance(self.model, _OVModelForWhisper):
            dataset_metadata = PREDEFINED_SPEECH_TO_TEXT_DATASETS[config.dataset]
            return self.build_from_dataset_name(
                config,
                dataset_metadata["id"],
                num_samples=config.num_samples,  # This is an upper bound on how many audios are needed
                dataset_config_name=dataset_metadata["name"],
                dataset_split=dataset_metadata["split"],
                trust_remote_code=config.trust_remote_code,
                streaming=dataset_metadata["streaming"],
            )

        # Defensive only: the enclosing isinstance check already restricts the model to the two types above.
        raise RuntimeError("Unsupported model type for calibration dataset collection.")

    if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
        if isinstance(config.dataset, str):
            dataset_name = config.dataset
            dataset_metadata = PREDEFINED_DIFFUSION_DATASETS[dataset_name]

            dataset = self.load_dataset(
                dataset_name,
                num_samples=config.num_samples,  # This is an upper bound on how many prompts are needed
                dataset_split=dataset_metadata["split"],
                streaming=dataset_metadata["streaming"],
            )
        elif isinstance(config.dataset, list) and all(isinstance(it, str) for it in config.dataset):
            # A list of string prompts is forwarded as-is.
            dataset = config.dataset
        else:
            raise RuntimeError(
                "Please provide dataset as one of the accepted dataset labels or as a list of string prompts."
            )

        return self.build_from_dataset(config, dataset)

    # NOTE(review): any other model type falls through and implicitly returns `None`, as the original
    # code appears to do -- confirm whether callers rely on this or whether it should raise instead.
273
278
274
279
def build_from_dataset_name (
275
280
self ,
@@ -346,76 +351,71 @@ def build_from_dataset_name(
346
351
347
352
return self .build_from_dataset (quantization_config , dataset , batch_size , data_collator , remove_unused_columns )
348
353
349
- def build_from_quantization_config (self , config : OVQuantizationConfigBase ) -> CalibrationDataset :
354
def build_from_dataset(
    self,
    quantization_config: OVQuantizationConfigBase,
    dataset: Union["Dataset", List],
    batch_size: Optional[int] = 1,
    data_collator: Optional[DataCollator] = None,
    remove_unused_columns: bool = False,
) -> CalibrationDataset:
    """
    Builds a calibration dataset from an already loaded dataset, dispatching on the type of `self.model`.

    Args:
        quantization_config (`OVQuantizationConfigBase`):
            The quantization configuration object.
        dataset (`Union[datasets.Dataset, List]`):
            The dataset to collect calibration data from. Passing a list is deprecated and logs a warning.
        batch_size (`int`, defaults to 1):
            The number of calibration samples to load per batch. Not always used.
        data_collator (`DataCollator`, *optional*):
            The function to use to form a batch from a list of elements of the calibration dataset. Not always used.
        remove_unused_columns (`bool`, defaults to `False`):
            Whether to remove the columns unused by the model forward method. Not always used.
    Returns:
        A calibration dataset as an instance of `CalibrationDataset` containing an `nncf.Dataset` for each model component.
    Raises:
        RuntimeError:
            If the raw-dataset path is entered but no model-specific preparation routine matches.
    """
    # Imported lazily inside the method, as in the original code
    # (presumably to avoid circular imports -- TODO confirm).
    from optimum.intel import OVModelForVisualCausalLM
    from optimum.intel.openvino.modeling_decoder import OVBaseDecoderModel
    from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper

    if is_diffusers_available():
        from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline

    if isinstance(dataset, list):
        logger.warning(
            "Providing dataset as a list is deprecated and will be removed in optimum-intel v1.25. "
            "Please provide it as `datasets.Dataset`."
        )

    # Evaluated once; `OVDiffusionPipeline` is only bound when diffusers is available, so the
    # availability check must short-circuit before the isinstance test.
    is_diffusion_pipeline = is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)

    if isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)) or is_diffusion_pipeline:
        # Prepare from the raw dataset, avoiding dataloader creation
        if batch_size != 1 or data_collator is not None or remove_unused_columns:
            logger.warning(
                "`batch_size`, `data_collator` and `remove_unused_columns` are not supported for this type of model."
            )

        if isinstance(self.model, OVModelForVisualCausalLM):
            return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
        if isinstance(self.model, _OVModelForWhisper):
            return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
        if is_diffusion_pipeline:
            return self._prepare_diffusion_calibration_data(quantization_config, dataset)

        # Defensive only: one of the three checks above matches whenever this branch is entered.
        raise RuntimeError("Unsupported model type for calibration dataset collection.")

    # Prepare from a dataloader.
    # `remove_unused_columns=True` is forced here regardless of the argument value; the original
    # comment says this holds "until it is not deprecated".
    dataloader = self._get_calibration_dataloader(dataset, batch_size, data_collator, remove_unused_columns=True)

    if isinstance(self.model, OVBaseDecoderModel):
        return self._prepare_decoder_calibration_data(quantization_config, dataloader)

    # Assuming this is the torch model quantization scenario
    return CalibrationDataset({"model": nncf.Dataset(dataloader)})
419
419
420
420
def load_dataset (
421
421
self ,
0 commit comments