@@ -450,7 +450,7 @@ class OVWeightCompressionTest(unittest.TestCase):
450
450
num_samples = 1 ,
451
451
processor = MODEL_NAMES ["llava_next" ],
452
452
),
453
- [{"int8" : 6 , "int4" : 24 }, {"int8" : 9 }, {"int8" : 1 }],
453
+ [{"int8" : 6 , "int4" : 24 }, {"int8" : 1 }, {"int8" : 9 }],
454
454
),
455
455
(
456
456
OVModelForVisualCausalLM ,
@@ -467,7 +467,7 @@ class OVWeightCompressionTest(unittest.TestCase):
467
467
tokenizer = MODEL_NAMES ["nanollava" ],
468
468
trust_remote_code = True ,
469
469
),
470
- [{"int8" : 16 , "int4" : 14 }, {"int8" : 15 }, {"int8" : 1 }],
470
+ [{"int8" : 16 , "int4" : 14 }, {"int8" : 1 }, {"int8" : 15 }],
471
471
),
472
472
]
473
473
)
@@ -489,7 +489,7 @@ class OVWeightCompressionTest(unittest.TestCase):
489
489
processor = MODEL_NAMES ["minicpmv" ],
490
490
trust_remote_code = True ,
491
491
),
492
- [{"int8" : 8 , "int4" : 22 }, {"int8" : 26 }, {"int8" : 1 }, {"int8" : 6 }],
492
+ [{"int8" : 8 , "int4" : 22 }, {"int8" : 1 }, {"int8" : 26 }, {"int8" : 6 }],
493
493
),
494
494
(
495
495
OVModelForVisualCausalLM ,
@@ -504,7 +504,7 @@ class OVWeightCompressionTest(unittest.TestCase):
504
504
num_samples = 1 ,
505
505
trust_remote_code = True ,
506
506
),
507
- [{"int8" : 8 , "int4" : 22 }, {"int8" : 11 }, {"int8" : 1 }],
507
+ [{"int8" : 8 , "int4" : 22 }, {"int8" : 1 }, {"int8" : 11 }],
508
508
),
509
509
(
510
510
OVModelForVisualCausalLM ,
@@ -519,7 +519,7 @@ class OVWeightCompressionTest(unittest.TestCase):
519
519
num_samples = 1 ,
520
520
trust_remote_code = True ,
521
521
),
522
- [{"int8" : 4 , "int4" : 14 }, {"int8" : 7 }, {"int8" : 1 }, {"int8" : 2 }],
522
+ [{"int8" : 4 , "int4" : 14 }, {"int8" : 1 }, {"int8" : 7 }, {"int8" : 2 }],
523
523
),
524
524
(
525
525
OVModelForVisualCausalLM ,
@@ -610,6 +610,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i
610
610
if isinstance (v , Enum ):
611
611
original_config_as_dict [k ] = v .value
612
612
self .assertEqual (original_config_as_dict , loaded_config .quantization_config .to_dict ())
613
+ self .assertFalse (model .model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
613
614
614
615
@parameterized .expand (SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS )
615
616
def test_ovmodel_8bit_weight_compression (self , model_cls , model_name , expected_pt_int8 , expected_ov_int8 ):
@@ -636,6 +637,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p
636
637
# Verify that the configuration is correctly saved and loaded
637
638
loaded_config = OVConfig .from_pretrained (tmp_dir )
638
639
self .assertEqual (OVWeightQuantizationConfig ().to_dict (), loaded_config .quantization_config .to_dict ())
640
+ self .assertFalse (model .model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
639
641
640
642
@parameterized .expand (SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS )
641
643
def test_ovmodel_4bit_weight_compression (self , model_cls , model_name , expected_int8_nodes , expected_int4_nodes ):
@@ -663,6 +665,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
663
665
# Verify that the configuration is correctly saved and loaded
664
666
loaded_config = OVConfig .from_pretrained (tmp_dir )
665
667
self .assertEqual (ov_config .quantization_config .to_dict (), loaded_config .quantization_config .to_dict ())
668
+ self .assertFalse (model .model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
666
669
667
670
@parameterized .expand (SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS )
668
671
def test_ovmodel_8bit_weight_compression_stateful (self , model_cls , model_name , expected_pt_int8 , expected_ov_int8 ):
@@ -688,6 +691,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e
688
691
# Verify that the configuration is correctly saved and loaded
689
692
loaded_config = OVConfig .from_pretrained (tmp_dir )
690
693
self .assertEqual (OVWeightQuantizationConfig ().to_dict (), loaded_config .quantization_config .to_dict ())
694
+ self .assertFalse (model .model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
691
695
692
696
@parameterized .expand (SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION )
693
697
def test_ovmodel_load_with_compressed_weights (self , model_cls , model_type , trust_remote_code ):
@@ -709,19 +713,20 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
709
713
self .assertEqual (model ._openvino_config .dtype , "int8" )
710
714
711
715
if model .export_feature .startswith ("text2text-generation" ):
712
- models = [model .encoder , model .decoder ]
716
+ models = [model .encoder . model , model .decoder . model ]
713
717
if model .decoder_with_past is not None :
714
- models .append (model .decoder_with_past )
718
+ models .append (model .decoder_with_past . model )
715
719
elif model .export_feature == "text-to-image" :
716
- models = [model .unet , model .vae_encoder , model .vae_decoder ]
717
- models .append (model .text_encoder if model_type == "stable-diffusion" else model .text_encoder_2 )
720
+ models = [model .unet .model , model .vae_encoder .model , model .vae_decoder .model ]
721
+ models .append (
722
+ model .text_encoder .model if model_type in ["stable-diffusion" , "sana" ] else model .text_encoder_2 .model
723
+ )
718
724
elif model_type == "open-clip" :
719
725
models = [model .text_model , model .visual_model ]
720
726
elif model .export_feature == "image-text-to-text" :
721
- models = [model .lm_model , model .vision_embeddings_model , model .text_embeddings_model ]
722
- models += [getattr (model , part ) for part in model .additional_parts ]
727
+ models = list (model .submodels .values ())
723
728
else :
724
- models = [model ]
729
+ models = [model . model ]
725
730
726
731
if model_type == "open-clip" :
727
732
pytest .skip (reason = "ticket 161043" )
@@ -734,6 +739,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
734
739
for i , model in enumerate (models ):
735
740
_ , num_weight_nodes = get_num_quantized_nodes (model )
736
741
self .assertEqual (expected_ov_int8 [i ], num_weight_nodes ["int8" ])
742
+ self .assertFalse (model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
737
743
738
744
@parameterized .expand (SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION )
739
745
def test_ovmodel_hybrid_quantization (self , model_cls , model_type , expected_fake_nodes , expected_int8_nodes ):
@@ -834,9 +840,10 @@ def test_ovmodel_4bit_auto_compression_with_config(
834
840
if isinstance (model , OVModelForCausalLM ):
835
841
submodels = [model .model ]
836
842
elif isinstance (model , OVModelForVisualCausalLM ):
837
- submodels = [model .lm_model , model .vision_embeddings_model , model .text_embeddings_model ]
838
- submodels += [getattr (model , part ) for part in model .additional_parts ]
843
+ submodels = list (model .submodels .values ())
839
844
compare_num_quantized_nodes_per_model (self , submodels , expected_num_weight_nodes_per_model )
845
+ for submodel in submodels :
846
+ self .assertFalse (submodel .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
840
847
841
848
model .save_pretrained (tmp_dir )
842
849
# At the moment the first model in the list is the only one we apply data-aware compression to
@@ -863,35 +870,45 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty
863
870
expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8 [model_type ][0 ]
864
871
_ , num_weight_nodes = get_num_quantized_nodes (model )
865
872
self .assertEqual (expected_ov_int8 , num_weight_nodes ["int8" ])
873
+ self .assertFalse (model .model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
866
874
867
875
@parameterized .expand (SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION )
868
876
def test_ovmodel_load_with_uncompressed_weights (self , model_cls , model_type , trust_remote_code ):
869
877
model = model_cls .from_pretrained (
870
878
MODEL_NAMES [model_type ], export = True , load_in_8bit = False , trust_remote_code = trust_remote_code
871
879
)
872
880
if model .export_feature .startswith ("text2text-generation" ):
873
- models = [model .encoder , model .decoder ]
881
+ models = [model .encoder . model , model .decoder . model ]
874
882
if model .decoder_with_past is not None :
875
- models .append (model .decoder_with_past )
883
+ models .append (model .decoder_with_past . model )
876
884
elif model .export_feature == "text-to-image" :
877
- models = [model .unet , model .vae_encoder , model .vae_decoder ]
878
- models .append (model .text_encoder if model_type == "stable-diffusion" else model .text_encoder_2 )
885
+ models = [model .unet .model , model .vae_encoder .model , model .vae_decoder .model ]
886
+ models .append (
887
+ model .text_encoder .model if model_type in ["stable-diffusion" , "sana" ] else model .text_encoder_2 .model
888
+ )
879
889
elif model_type == "open-clip" :
880
890
models = [model .text_model , model .visual_model ]
881
891
elif model .export_feature == "image-text-to-text" :
882
- models = [model .lm_model , model .vision_embeddings_model , model .text_embeddings_model ]
883
- models += [getattr (model , part ) for part in model .additional_parts ]
892
+ models = list (model .submodels .values ())
884
893
else :
885
- models = [model ]
894
+ models = [model . model ]
886
895
887
- for i , model in enumerate (models ):
888
- _ , num_weight_nodes = get_num_quantized_nodes (model )
896
+ for i , submodel in enumerate (models ):
897
+ _ , num_weight_nodes = get_num_quantized_nodes (submodel )
889
898
self .assertEqual (0 , num_weight_nodes ["int8" ])
899
+ if "text-generation" in model .export_feature or ("image-text-to-text" in model .export_feature and i == 0 ):
900
+ self .assertTrue (submodel .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
901
+ kv_cache_precision = submodel .get_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]).value
902
+ self .assertTrue (kv_cache_precision == "f16" )
890
903
891
904
def test_ovmodel_load_large_model_with_default_compressed_weights (self ):
905
+ compressed_model_mock_obj = unittest .mock .Mock ()
906
+ compressed_model_mock_obj .has_rt_info .return_value = False
907
+
892
908
def main_export_in_stacktrace (* args , ** kwargs ):
893
909
# Compression was called from `main_export`
894
910
self .assertTrue (inspect .stack ()[5 ].function == "main_export" )
911
+ return compressed_model_mock_obj
895
912
896
913
with unittest .mock .patch (
897
914
"openvino.runtime.op.Constant.shape" , new_callable = unittest .mock .PropertyMock
@@ -929,15 +946,20 @@ def test_ovmodel_load_large_model_with_uncompressed_weights(self):
929
946
) as ov_constant_shape :
930
947
ov_constant_shape .return_value = (2000000000 ,)
931
948
with unittest .mock .patch ("nncf.compress_weights" ) as compress_weights_patch :
932
- _ = OVModelForCausalLM .from_pretrained (
949
+ model = OVModelForCausalLM .from_pretrained (
933
950
MODEL_NAMES ["llama" ], export = True , load_in_8bit = False , compile = False , use_cache = False
934
951
)
935
952
compress_weights_patch .assert_not_called ()
953
+ self .assertTrue (model .model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
936
954
937
955
def test_ovmodel_load_large_model_with_additional_quantization_config (self ):
956
+ compressed_model_mock_obj = unittest .mock .Mock ()
957
+ compressed_model_mock_obj .has_rt_info .return_value = False
958
+
938
959
def main_export_not_in_stacktrace (* args , ** kwargs ):
939
960
# Compression was not called from `main_export`
940
961
self .assertTrue (all (frame_info .function != "main_export" for frame_info in inspect .stack ()))
962
+ return compressed_model_mock_obj
941
963
942
964
with unittest .mock .patch (
943
965
"openvino.runtime.op.Constant.shape" , new_callable = unittest .mock .PropertyMock
@@ -990,8 +1012,7 @@ def test_ovmodel_4bit_dynamic_with_config(
990
1012
if isinstance (model , OVModelForCausalLM ):
991
1013
submodels = [model .model ]
992
1014
elif isinstance (model , OVModelForVisualCausalLM ):
993
- submodels = [model .lm_model , model .vision_embeddings_model , model .text_embeddings_model ]
994
- submodels += [getattr (model , part ) for part in model .additional_parts ]
1015
+ submodels = list (model .submodels .values ())
995
1016
compare_num_quantized_nodes_per_model (self , submodels , expected_num_weight_nodes_per_model )
996
1017
997
1018
model .save_pretrained (tmp_dir )
0 commit comments