@@ -173,19 +173,32 @@ def build_from_dataset(
173
173
if is_diffusers_available ():
174
174
from optimum .intel .openvino .modeling_diffusion import OVDiffusionPipeline
175
175
176
- dataloader = self ._get_calibration_dataloader (dataset , batch_size , data_collator , remove_unused_columns )
177
-
178
- if isinstance (self .model , OVBaseDecoderModel ):
179
- return self ._prepare_decoder_calibration_data (quantization_config , dataloader )
180
- elif isinstance (self .model , OVModelForVisualCausalLM ):
181
- return self ._prepare_visual_causal_lm_calibration_data (quantization_config , dataloader )
182
- elif isinstance (self .model , _OVModelForWhisper ):
183
- return self ._prepare_speech_to_text_calibration_data (quantization_config , dataloader )
184
- elif is_diffusers_available () and isinstance (self .model , OVDiffusionPipeline ):
185
- return self ._prepare_diffusion_calibration_data (quantization_config , dataloader )
176
+ if isinstance (self .model , (OVModelForVisualCausalLM , _OVModelForWhisper )) or (
177
+ is_diffusers_available () and isinstance (self .model , OVDiffusionPipeline )
178
+ ):
179
+ # Prepare from raw dataset avoiding dataloader creation
180
+ if batch_size != 1 or data_collator is not None or remove_unused_columns :
181
+ logger .warning (
182
+ "`batch_size`, `data_collator` and `remove_unused_columns` are not supported for this type of model."
183
+ )
184
+
185
+ if isinstance (self .model , OVModelForVisualCausalLM ):
186
+ return self ._prepare_visual_causal_lm_calibration_data (quantization_config , dataset )
187
+ elif isinstance (self .model , _OVModelForWhisper ):
188
+ return self ._prepare_speech_to_text_calibration_data (quantization_config , dataset )
189
+ elif is_diffusers_available () and isinstance (self .model , OVDiffusionPipeline ):
190
+ return self ._prepare_diffusion_calibration_data (quantization_config , dataset )
191
+ else :
192
+ # TODO
193
+ raise Exception ()
186
194
else :
187
- # Torch model quantization scenario
188
- return {"model" : nncf .Dataset (dataloader )}
195
+ # Prepare from dataloader
196
+ dataloader = self ._get_calibration_dataloader (dataset , batch_size , data_collator , remove_unused_columns )
197
+ if isinstance (self .model , OVBaseDecoderModel ):
198
+ return self ._prepare_decoder_calibration_data (quantization_config , dataloader )
199
+ else :
200
+ # Torch model quantization scenario
201
+ return {"model" : nncf .Dataset (dataloader )}
189
202
190
203
def build_from_dataset_name (
191
204
self ,
@@ -257,6 +270,9 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> Di
257
270
if is_diffusers_available ():
258
271
from optimum .intel .openvino .modeling_diffusion import OVDiffusionPipeline
259
272
273
+ if config .dataset is None :
274
+ raise ValueError ("Please provide a dataset for calibration." )
275
+
260
276
if isinstance (self .model , OVModelForCausalLM ):
261
277
return self ._prepare_causal_lm_calibration_data (config )
262
278
elif isinstance (self .model , (OVModelForVisualCausalLM , _OVModelForWhisper )):
@@ -266,108 +282,46 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> Di
266
282
"model id, or a path to a directory containing all the required configuration files."
267
283
)
268
284
269
- trc = config .trust_remote_code
270
- processor = AutoProcessor .from_pretrained (config .processor , trust_remote_code = trc )
271
285
if isinstance (self .model , OVModelForVisualCausalLM ):
272
- try :
273
- tokenizer = AutoTokenizer .from_pretrained (config .tokenizer , trust_remote_code = trc )
274
- tokenizer_error = None
275
- except Exception as tokenizer_error : # noqa: F841
276
- tokenizer = None
277
-
278
286
dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS [config .dataset ]
279
-
280
- def preprocess_function (item ):
281
- inputs_metadata = dataset_metadata ["inputs" ]
282
- instruction = item [inputs_metadata ["instruction" ]]
283
- image_url = item [inputs_metadata ["image_url" ]]
284
-
285
- image = Image .open (requests .get (image_url , stream = True ).raw )
286
-
287
- try :
288
- inputs = self .model .preprocess_inputs (
289
- text = instruction ,
290
- image = image ,
291
- processor = processor ,
292
- tokenizer = tokenizer ,
293
- config = self .model .config ,
294
- )
295
- # Remove batch dimension
296
- for key in inputs .keys ():
297
- inputs [key ] = inputs [key ][0 ]
298
- except ValueError as value_error :
299
- if "Tokenizer is required." in str (value_error ) and tokenizer_error is not None :
300
- raise tokenizer_error
301
- raise value_error
302
-
303
- return inputs
304
-
305
287
return self .build_from_dataset_name (
306
288
config ,
307
289
dataset_metadata ["id" ],
308
290
num_samples = config .num_samples ,
309
291
dataset_split = dataset_metadata ["split" ],
310
- preprocess_function = preprocess_function ,
311
- preprocess_batch = False ,
312
- trust_remote_code = trc ,
292
+ trust_remote_code = config .trust_remote_code ,
313
293
)
314
294
elif isinstance (self .model , _OVModelForWhisper ):
315
295
dataset_metadata = PREDEFINED_SPEECH_TO_TEXT_DATASETS [config .dataset ]
316
-
317
- def preprocess_function (item ):
318
- audio = item ["audio" ]["array" ]
319
- sampling_rate = item ["audio" ]["sampling_rate" ]
320
- inputs = processor (audio , sampling_rate = sampling_rate , return_tensors = "pt" )
321
- # This way key "audio" in the original dict will be overridden and not cause problems
322
- return {"audio" : inputs .input_features [0 ]}
323
-
324
296
return self .build_from_dataset_name (
325
297
config ,
326
298
dataset_metadata ["id" ],
327
299
num_samples = config .num_samples , # This is an upper bound on how many audios are needed
328
300
dataset_config_name = dataset_metadata ["name" ],
329
301
dataset_split = dataset_metadata ["split" ],
330
- preprocess_function = preprocess_function ,
331
- preprocess_batch = False ,
332
- trust_remote_code = trc ,
302
+ trust_remote_code = config .trust_remote_code ,
333
303
streaming = dataset_metadata ["streaming" ],
334
304
)
335
305
else :
336
306
raise Exception
337
307
elif is_diffusers_available () and isinstance (self .model , OVDiffusionPipeline ):
338
- dataset = config .dataset
339
-
340
- dataset_metadata = None
341
- if isinstance (dataset , str ):
342
- dataset_name = dataset
308
+ if isinstance (config .dataset , str ):
309
+ dataset_name = config .dataset
343
310
dataset_metadata = PREDEFINED_DIFFUSION_DATASETS [dataset_name ]
344
311
345
- def preprocess_function (item ):
346
- return {"prompt" : item [dataset_metadata ["prompt_column_name" ]]}
347
-
348
312
dataset = self .load_dataset (
349
313
dataset_name ,
350
314
num_samples = config .num_samples , # This is an upper bound on how many prompts are needed
351
315
dataset_split = dataset_metadata ["split" ],
352
- preprocess_function = preprocess_function ,
353
- preprocess_batch = False ,
354
316
streaming = dataset_metadata ["streaming" ],
355
317
)
356
- elif not (isinstance (dataset , list ) and all (isinstance (it , str ) for it in dataset )):
357
- raise Exception
358
-
359
- def collate_fn (features ):
360
- first = features [0 ]
361
- if isinstance (first , dict ) and dataset_metadata is not None :
362
- # List of dicts case
363
- batch = [it ["prompt" ] for it in features ]
364
- else :
365
- # List of strings case
366
- # TODO: ?
367
- batch = features
368
- return batch
318
+ elif isinstance (config .dataset , list ) and all (isinstance (it , str ) for it in config .dataset ):
319
+ dataset = config .dataset
320
+ else :
321
+ # TODO
322
+ raise Exception ()
369
323
370
- return self .build_from_dataset (config , dataset , data_collator = collate_fn )
324
+ return self .build_from_dataset (config , dataset )
371
325
372
326
def load_dataset (
373
327
self ,
@@ -514,11 +468,33 @@ def _prepare_causal_lm_calibration_data(
514
468
return {"model" : calibration_dataset }
515
469
516
470
def _prepare_visual_causal_lm_calibration_data (
517
- self , quantization_config : OVQuantizationConfigBase , dataloader : OVDataLoader
471
+ self , config : OVQuantizationConfigBase , dataset : "Dataset"
518
472
) -> Dict [str , nncf .Dataset ]:
473
+ processor = AutoProcessor .from_pretrained (config .processor , trust_remote_code = config .trust_remote_code )
474
+ try :
475
+ tokenizer = AutoTokenizer .from_pretrained (config .tokenizer , trust_remote_code = config .trust_remote_code )
476
+ tokenizer_error = None
477
+ except Exception as tokenizer_error : # noqa: F841
478
+ tokenizer = None
479
+
480
+ dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS [config .dataset ]
481
+
519
482
calibration_data = []
520
- num_samples = quantization_config .num_samples or 32
521
- for inputs in tqdm (dataloader , desc = "Collecting calibration dataset" , total = num_samples ):
483
+ num_samples = config .num_samples or 32
484
+ for item in tqdm (dataset , desc = "Collecting calibration dataset" , total = num_samples ):
485
+ instruction = item [dataset_metadata ["inputs" ]["instruction" ]]
486
+ image_url = item [dataset_metadata ["inputs" ]["image_url" ]]
487
+ image = Image .open (requests .get (image_url , stream = True ).raw ).convert ("RGB" )
488
+
489
+ try :
490
+ inputs = self .model .preprocess_inputs (
491
+ text = instruction , image = image , processor = processor , tokenizer = tokenizer , config = self .model .config
492
+ )
493
+ except ValueError as value_error :
494
+ if "Tokenizer is required." in str (value_error ) and tokenizer_error is not None :
495
+ raise tokenizer_error
496
+ raise value_error
497
+
522
498
input_ids = inputs .get ("input_ids" )
523
499
position_ids = torch .arange (input_ids .size (1 )).unsqueeze (0 ).to (input_ids .device )
524
500
@@ -542,7 +518,7 @@ def _prepare_visual_causal_lm_calibration_data(
542
518
return {"lm_model" : nncf .Dataset (calibration_data )}
543
519
544
520
def _prepare_speech_to_text_calibration_data (
545
- self , quantization_config : OVQuantizationConfigBase , dataloader : OVDataLoader
521
+ self , config : OVQuantizationConfigBase , dataset : "Dataset"
546
522
) -> Dict [str , nncf .Dataset ]:
547
523
from optimum .intel .openvino .modeling_seq2seq import OVDecoder , OVEncoder
548
524
@@ -559,12 +535,17 @@ def _prepare_speech_to_text_calibration_data(
559
535
)
560
536
561
537
try :
562
- # Download audio inputs beforehand to avoid possible connection issues
563
- num_samples = quantization_config .num_samples or 32
564
- audio_inputs = list (tqdm (dataloader , desc = "Downloading audio inputs" , total = num_samples ))
538
+ processor = AutoProcessor .from_pretrained (config .processor , trust_remote_code = config .trust_remote_code )
565
539
566
- for inputs in tqdm (audio_inputs , desc = "Collecting calibration data" ):
567
- self .model .generate (inputs ["audio" ])
540
+ # Download audio inputs beforehand to avoid possible connection issues
541
+ num_samples = config .num_samples or 32
542
+ downloaded_dataset = list (tqdm (dataset , desc = "Downloading audio inputs" , total = num_samples ))
543
+
544
+ for item in tqdm (downloaded_dataset , desc = "Collecting calibration data" ):
545
+ audio = item ["audio" ]["array" ]
546
+ sampling_rate = item ["audio" ]["sampling_rate" ]
547
+ input_features = processor (audio , sampling_rate = sampling_rate , return_tensors = "pt" ).input_features
548
+ self .model .generate (input_features )
568
549
finally :
569
550
for model in models .values ():
570
551
model .request = model .request .request
@@ -575,7 +556,7 @@ def _prepare_speech_to_text_calibration_data(
575
556
return calibration_data
576
557
577
558
def _prepare_diffusion_calibration_data (
578
- self , quantization_config : OVQuantizationConfigBase , dataloader : OVDataLoader
559
+ self , config : OVQuantizationConfigBase , dataset : "Dataset"
579
560
) -> Dict [str , nncf .Dataset ]:
580
561
self .model .compile ()
581
562
@@ -585,16 +566,18 @@ def _prepare_diffusion_calibration_data(
585
566
size = diffuser .config .get ("sample_size" , 64 ) * self .model .vae_scale_factor
586
567
height , width = 2 * (min (size , 512 ),)
587
568
588
- num_samples = quantization_config .num_samples or 200
569
+ num_samples = config .num_samples or 200
589
570
calibration_data = []
590
571
try :
591
572
diffuser .request = InferRequestWrapper (diffuser .request , calibration_data )
592
573
593
- for inputs in tqdm (dataloader , desc = "Collecting calibration data" ):
594
- if isinstance (inputs , dict ):
595
- self .model (** inputs , height = height , width = width )
596
- else :
597
- self .model (inputs , height = height , width = width )
574
+ for item in tqdm (dataset , desc = "Collecting calibration data" ):
575
+ prompt = (
576
+ item [PREDEFINED_DIFFUSION_DATASETS [config .dataset ]["prompt_column_name" ]]
577
+ if isinstance (item , dict )
578
+ else item
579
+ )
580
+ self .model (prompt , height = height , width = width )
598
581
if len (calibration_data ) >= num_samples :
599
582
break
600
583
finally :
0 commit comments