     AutoModelForCTC,
     AutoModelForImageClassification,
     AutoModelForMaskedLM,
-    AutoProcessor,
     AutoModelForQuestionAnswering,
     AutoModelForSeq2SeqLM,
     AutoModelForSequenceClassification,
     AutoModelForSpeechSeq2Seq,
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
+    AutoProcessor,
     AutoTokenizer,
     GenerationConfig,
     Pix2StructForConditionalGeneration,
@@ -1868,15 +1868,14 @@ def test_compare_with_and_without_past_key_values(self):


 class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = [
-        "llava",
-        "minicpmv"
-    ]
+    SUPPORTED_ARCHITECTURES = ["llava"]

     REMOTE_CODE_MODELS = ["minicpmv"]

     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES += ["llava_next"]
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES += ["minicpmv"]
     TASK = "image-text-to-text"

     IMAGE = Image.open(
@@ -1904,20 +1903,24 @@ def test_compare_to_transformers(self, model_arch):
         if "llava" in model_arch:
             prompt = "<image>\nWhat is shown in this image?"
         elif "minicpmv" in model_arch:
-            prompt = "<|im_start|>user\n(<image>./</image>)\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
+            prompt = "<|im_start|>user\n(<image>./</image>)\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
         model_id = MODEL_NAMES[model_arch]
         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
-        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(
+            model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
         inputs = processor(images=[self.IMAGE.resize((600, 600))], text=[prompt], return_tensors="pt")
-        ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        ov_model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
         self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
         self.assertIsInstance(ov_model.vision_embeddings, OVVisionEmbedding)
         self.assertIsInstance(ov_model.language_model, OVModelWithEmbedForCausalLM)
         for additional_part in ov_model.additional_parts:
             self.assertTrue(hasattr(ov_model, additional_part))
             self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
-        if not "minicpmv" in model_arch:
+        if "minicpmv" not in model_arch:
             set_seed(SEED)
             with torch.no_grad():
                 transformers_outputs = transformers_model(**inputs)
@@ -1939,7 +1942,7 @@ def test_compare_to_transformers(self, model_arch):
             set_seed(SEED)
             transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
             if model_arch == "minicpmv":
-                ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1]:]
+                ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
             self.assertTrue(
                 torch.equal(ov_outputs, transformers_outputs),
                 f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
@@ -1953,7 +1956,9 @@ def test_compare_to_transformers(self, model_arch):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_generate_utils(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
-        model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
         preprocessor = AutoProcessor.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
         if "llava" in model_arch:
             question = "<image>\nDescribe image"
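
For reference, the end-to-end flow these tests exercise looks roughly like the sketch below. It is a hedged illustration, not part of the diff: the checkpoint name and image path are placeholders (not taken from MODEL_NAMES), and it assumes OVModelForVisualCausalLM is importable from optimum.intel as in this test module.

# Hedged sketch of the flow exercised by the tests above; placeholder names
# are assumptions, not values from the test suite.
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForVisualCausalLM  # assumed import path

model_id = "llava-hf/llava-1.5-7b-hf"  # placeholder checkpoint
processor = AutoProcessor.from_pretrained(model_id)
# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly,
# mirroring the from_pretrained(..., export=True) calls in the tests.
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

image = Image.open("example.jpg")  # placeholder local image
prompt = "<image>\nWhat is shown in this image?"
inputs = processor(images=[image], text=[prompt], return_tensors="pt")

# Greedy generation; the tests compare this output token-for-token against
# the original transformers model.
generated = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated, skip_special_tokens=True)[0])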