23
23
from functools import partial
24
24
from typing import Union
25
25
26
+ import openvino as ov
26
27
import pytest
27
28
import evaluate
28
29
import numpy as np
82
83
MODEL_NAMES ,
83
84
get_num_quantized_nodes ,
84
85
_ARCHITECTURES_TO_EXPECTED_INT8 ,
85
- compare_num_quantized_nodes_per_model ,
86
+ check_compression_state_per_model ,
86
87
)
87
88
88
89
_TASK_TO_DATASET = {
@@ -713,20 +714,18 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
713
714
self .assertEqual (model ._openvino_config .dtype , "int8" )
714
715
715
716
if model .export_feature .startswith ("text2text-generation" ):
716
- models = [model .encoder . model , model .decoder . model ]
717
+ models = [model .encoder , model .decoder ]
717
718
if model .decoder_with_past is not None :
718
- models .append (model .decoder_with_past . model )
719
+ models .append (model .decoder_with_past )
719
720
elif model .export_feature == "text-to-image" :
720
- models = [model .unet .model , model .vae_encoder .model , model .vae_decoder .model ]
721
- models .append (
722
- model .text_encoder .model if model_type in ["stable-diffusion" , "sana" ] else model .text_encoder_2 .model
723
- )
721
+ models = [model .unet , model .vae_encoder , model .vae_decoder ]
722
+ models .append (model .text_encoder if model_type in ["stable-diffusion" , "sana" ] else model .text_encoder_2 )
724
723
elif model_type == "open-clip" :
725
724
models = [model .text_model , model .visual_model ]
726
725
elif model .export_feature == "image-text-to-text" :
727
726
models = list (model .submodels .values ())
728
727
else :
729
- models = [model . model ]
728
+ models = [model ]
730
729
731
730
if model_type == "open-clip" :
732
731
pytest .skip (reason = "ticket 161043" )
@@ -736,10 +735,8 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
736
735
check_optimization_not_applicable_to_optimized_model (model , quantization_config = {"bits" : 8 })
737
736
738
737
expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8 [model_type ]
739
- for i , model in enumerate (models ):
740
- _ , num_weight_nodes = get_num_quantized_nodes (model )
741
- self .assertEqual (expected_ov_int8 [i ], num_weight_nodes ["int8" ])
742
- self .assertFalse (model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
738
+ expected_ov_int8 = [{"int8" : it } for it in expected_ov_int8 ]
739
+ check_compression_state_per_model (self , models , expected_ov_int8 )
743
740
744
741
@parameterized .expand (SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION )
745
742
def test_ovmodel_hybrid_quantization (self , model_cls , model_type , expected_fake_nodes , expected_int8_nodes ):
@@ -841,9 +838,7 @@ def test_ovmodel_4bit_auto_compression_with_config(
841
838
submodels = [model .model ]
842
839
elif isinstance (model , OVModelForVisualCausalLM ):
843
840
submodels = list (model .submodels .values ())
844
- compare_num_quantized_nodes_per_model (self , submodels , expected_num_weight_nodes_per_model )
845
- for submodel in submodels :
846
- self .assertFalse (submodel .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
841
+ check_compression_state_per_model (self , submodels , expected_num_weight_nodes_per_model )
847
842
848
843
model .save_pretrained (tmp_dir )
849
844
# At the moment the first model in the list is the only one we apply data-aware compression to
@@ -869,36 +864,34 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty
869
864
870
865
expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8 [model_type ][0 ]
871
866
_ , num_weight_nodes = get_num_quantized_nodes (model )
872
- self .assertEqual (expected_ov_int8 , num_weight_nodes ["int8" ])
873
- self .assertFalse (model .model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
867
+ check_compression_state_per_model (self , [model .model ], [{"int8" : expected_ov_int8 }])
874
868
875
869
@parameterized .expand (SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION )
876
870
def test_ovmodel_load_with_uncompressed_weights (self , model_cls , model_type , trust_remote_code ):
877
871
model = model_cls .from_pretrained (
878
872
MODEL_NAMES [model_type ], export = True , load_in_8bit = False , trust_remote_code = trust_remote_code
879
873
)
880
874
if model .export_feature .startswith ("text2text-generation" ):
881
- models = [model .encoder . model , model .decoder . model ]
875
+ models = [model .encoder , model .decoder ]
882
876
if model .decoder_with_past is not None :
883
- models .append (model .decoder_with_past . model )
877
+ models .append (model .decoder_with_past )
884
878
elif model .export_feature == "text-to-image" :
885
- models = [model .unet .model , model .vae_encoder .model , model .vae_decoder .model ]
886
- models .append (
887
- model .text_encoder .model if model_type in ["stable-diffusion" , "sana" ] else model .text_encoder_2 .model
888
- )
879
+ models = [model .unet , model .vae_encoder , model .vae_decoder ]
880
+ models .append (model .text_encoder if model_type in ["stable-diffusion" , "sana" ] else model .text_encoder_2 )
889
881
elif model_type == "open-clip" :
890
882
models = [model .text_model , model .visual_model ]
891
883
elif model .export_feature == "image-text-to-text" :
892
884
models = list (model .submodels .values ())
893
885
else :
894
- models = [model . model ]
886
+ models = [model ]
895
887
896
888
for i , submodel in enumerate (models ):
897
- _ , num_weight_nodes = get_num_quantized_nodes (submodel )
889
+ ov_model = submodel if isinstance (submodel , ov .Model ) else submodel .model
890
+ _ , num_weight_nodes = get_num_quantized_nodes (ov_model )
898
891
self .assertEqual (0 , num_weight_nodes ["int8" ])
899
892
if "text-generation" in model .export_feature or ("image-text-to-text" in model .export_feature and i == 0 ):
900
- self .assertTrue (submodel .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
901
- kv_cache_precision = submodel .get_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]).value
893
+ self .assertTrue (ov_model .has_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]))
894
+ kv_cache_precision = ov_model .get_rt_info (["runtime_options" , "KV_CACHE_PRECISION" ]).value
902
895
self .assertTrue (kv_cache_precision == "f16" )
903
896
904
897
def test_ovmodel_load_large_model_with_default_compressed_weights (self ):
@@ -1013,7 +1006,7 @@ def test_ovmodel_4bit_dynamic_with_config(
1013
1006
submodels = [model .model ]
1014
1007
elif isinstance (model , OVModelForVisualCausalLM ):
1015
1008
submodels = list (model .submodels .values ())
1016
- compare_num_quantized_nodes_per_model (self , submodels , expected_num_weight_nodes_per_model )
1009
+ check_compression_state_per_model (self , submodels , expected_num_weight_nodes_per_model )
1017
1010
1018
1011
model .save_pretrained (tmp_dir )
1019
1012
openvino_config = OVConfig .from_pretrained (tmp_dir )
0 commit comments