@@ -459,57 +459,73 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
        self.assertEqual(0, num_int8)

    def test_ovmodel_load_large_model_with_default_compressed_weights(self):
-        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
-            model_mixin_patch.num_parameters.return_value = 2e9
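+        # Fake a 2e9-parameter model: above the size threshold, weights are
+        # compressed to 8 bit by default.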
+        with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters:
+            mock_tensor = unittest.mock.Mock()
+            mock_tensor.numel = lambda: 2000000000
+            mock_tensor.requires_grad = True
+            model_parameters.return_value = [mock_tensor]
            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
                    _ = OVModelForCausalLM.from_pretrained(
                        MODEL_NAMES["llama"], export=True, compile=False, use_cache=False
                    )
-            saving_params = {
-                "model": unittest.mock.ANY,
-                "path": unittest.mock.ANY,
-                "compression_option": "int8",
-                "compression_ratio": None,
-            }
-            save_model_patch.aasert_called_with(saving_params)
+            save_model_patch.assert_called_with(
+                unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(quantization_config={"bits": 8})
+            )

    def test_ovmodel_load_large_model_with_uncompressed_weights(self):
-        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
-            model_mixin_patch.num_parameters.return_value = 2e9
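+        # Same 2e9-parameter mock, but load_in_8bit=False below should keep
+        # the exported weights in fp32.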
+        with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters:
+            mock_tensor = unittest.mock.Mock()
+            mock_tensor.numel = lambda: 2000000000
+            mock_tensor.requires_grad = True
+            model_parameters.return_value = [mock_tensor]
            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
                    _ = OVModelForCausalLM.from_pretrained(
                        MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False
                    )
-            saving_params = {
-                "model": unittest.mock.ANY,
-                "path": unittest.mock.ANY,
-                "compression_option": "fp32",
-                "compression_ratio": None,
-            }
-            save_model_patch.aasert_called_with(saving_params)
+            save_model_patch.assert_called_with(
+                unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(dtype="fp32")
+            )

    def test_ovmodel_load_large_model_with_additional_quantization_config(self):
-        with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch:
-            model_mixin_patch.num_parameters.return_value = 2e9
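+        # 2e9-parameter mock again; the explicit 4-bit quantization_config
+        # below takes precedence over the default 8-bit compression.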
+        with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters:
+            mock_tensor = unittest.mock.Mock()
+            mock_tensor.numel = lambda: 2000000000
+            mock_tensor.requires_grad = True
+            model_parameters.return_value = [mock_tensor]
            with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch:
                with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch:
-                    _ = OVModelForCausalLM.from_pretrained(
-                        MODEL_NAMES["llama"],
-                        export=True,
-                        compile=False,
-                        use_cache=False,
-                        quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
-                    )
-                    # quantization will be performed later, using load_model
-                    saving_params = {
-                        "model": unittest.mock.ANY,
-                        "path": unittest.mock.ANY,
-                        "compression_option": "fp32",
-                        "compression_ratio": None,
-                    }
-                    save_model_patch.aasert_called_with(saving_params)
+                    with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch:
+                        _ = OVModelForCausalLM.from_pretrained(
+                            MODEL_NAMES["llama"],
+                            export=True,
+                            compile=False,
+                            use_cache=False,
+                            quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8),
+                        )
+                        # quantization will be performed later, using load_model
+                        save_model_patch.assert_called_with(
+                            unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(dtype="fp32")
+                        )
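+                        # bits=4 with sym=True corresponds to NNCF's INT4_SYM
+                        # mode; ratio and group_size are forwarded unchanged.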
+                        compression_params = {
+                            "mode": nncf.CompressWeightsMode.INT4_SYM,
+                            "ratio": 0.8,
+                            "group_size": -1,
+                            "all_layers": None,
+                            "sensitivity_metric": None,
+                            "dataset": None,
+                            "ignored_scope": None,
+                        }
+                        compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)


class OVQuantizerQATest(unittest.TestCase):