@@ -863,6 +863,10 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in self.REMOTE_CODE_MODELS:
             model_kwargs = {"trust_remote_code": True}
 
+        # starting from transformers 4.45.0, gemma2 uses eager attention by default, while OpenVINO uses sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implementation"] = "sdpa"
+
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
@@ -1094,6 +1098,10 @@ def test_beam_search(self, model_arch):
                 "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
                 "trust_remote_code": True,
             }
+
+        # starting from transformers 4.45.0, gemma2 uses eager attention by default, while OpenVINO uses sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implementation"] = "sdpa"
         # Qwen tokenizer does not support padding; chatglm and glm4 testing models produce nan that is incompatible with beam search
         if model_arch in ["qwen", "chatglm", "glm4"]:
             return
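
The gate added in both hunks pins the reference transformers model to the same attention backend the OpenVINO export uses, so outputs stay comparable. A minimal sketch of how that kwarg flows into transformers, assuming is_transformers_version lives in optimum.intel.utils.import_utils and using a hypothetical gemma2 checkpoint id; AutoModelForCausalLM.from_pretrained accepts attn_implementation ("eager", "sdpa", "flash_attention_2"):

# Hedged sketch, not the test itself: override gemma2's eager default so the
# transformers reference model matches the sdpa behavior of the OV export.
from transformers import AutoModelForCausalLM
from optimum.intel.utils.import_utils import is_transformers_version  # assumed import path

model_id = "google/gemma-2-2b"  # hypothetical checkpoint, for illustration only
model_kwargs = {}
if is_transformers_version(">=", "4.45.0"):
    model_kwargs["attn_implementation"] = "sdpa"  # force sdpa instead of the eager default
ref_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)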