@@ -2141,7 +2141,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES += ["maira2"]

     if is_transformers_version(">=", "4.49.0"):
-        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
+        SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"]
         SUPPORT_VIDEO.append("qwen2_5_vl")
     TASK = "image-text-to-text"
     REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2"]
@@ -2154,7 +2154,13 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     )

     def get_transformer_model_class(self, model_arch):
-        if is_transformers_version(">=", "4.46") and model_arch in ["llava", "llava_next", "qwen2_vl", "qwen2_5_vl"]:
+        if is_transformers_version(">=", "4.46") and model_arch in [
+            "llava",
+            "llava_next",
+            "qwen2_vl",
+            "qwen2_5_vl",
+            "got_ocr2",
+        ]:
             from transformers import AutoModelForImageTextToText

             return AutoModelForImageTextToText
@@ -2339,14 +2345,16 @@ def test_generate_utils(self, model_arch):
         outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)

-        # No input image case
-        question = "Hi, how are you?"
-        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
-        outputs = model.generate(**inputs, max_new_tokens=10)
-        # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
-        outputs = outputs[:, inputs["input_ids"].shape[1] :]
-        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        self.assertIsInstance(outputs[0], str)
+        # GOT-OCR2 does not support text-only input
+        if model_arch != "got_ocr2":
+            # No input image case
+            question = "Hi, how are you?"
+            inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
+            outputs = model.generate(**inputs, max_new_tokens=10)
+            # filter out the original prompt because it may contain out-of-tokenizer tokens, e.g. the nanollava text separator = -200
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            self.assertIsInstance(outputs[0], str)

         # video loader helper only available for transformers >= 4.49
         if model_arch in self.SUPPORT_VIDEO and is_transformers_version(">=", "4.49"):
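For reference, a minimal sketch of the call pattern the new guard accounts for, under stated assumptions: the checkpoint id below is hypothetical, and preprocess_inputs/generate are the same OVModelForVisualCausalLM entry points this test already exercises. Because GOT-OCR2 is image-in/text-out, an image must always be supplied, which is why the text-only (image=None) branch above is skipped for it.

    from PIL import Image
    from transformers import AutoProcessor, AutoTokenizer
    from optimum.intel import OVModelForVisualCausalLM

    model_id = "tiny-random/got-ocr2"  # hypothetical checkpoint id, for illustration only
    model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
    processor = AutoProcessor.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # GOT-OCR2 requires an image input; the text-only path guarded by
    # `if model_arch != "got_ocr2"` in the test would fail for this architecture.
    image = Image.new("RGB", (336, 336), color="white")  # stand-in input image
    inputs = model.preprocess_inputs(text="", image=image, processor=processor, tokenizer=tokenizer)
    outputs = model.generate(**inputs, max_new_tokens=10)
    # decode only the newly generated tokens, mirroring the slicing used in the test
    print(tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True))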