From bbec36a7d1ba70b53c3d5f7d59cc3eeca5a71074 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Wed, 29 Jan 2025 16:54:07 +0400
Subject: [PATCH 1/4] align nanollava input with original model

---
 optimum/intel/openvino/modeling_visual_language.py | 7 ++++++-
 tests/openvino/test_modeling.py                    | 6 +-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 1c0e35cca2..813b9ace2e 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -695,8 +695,10 @@ def forward(
         image_grid_thw=None,
         video_grid_thw=None,
         rope_deltas=None,
+        images=None,
         **kwargs,
     ):
+        pixel_values = pixel_values if pixel_values is not None else images
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
             input_ids,
             pixel_values,
@@ -794,6 +796,9 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}
 
+        if pixel_values is None:
+            pixel_values = kwargs.get("images")
+
         model_inputs.update(
             {
                 "position_ids": position_ids,
@@ -1907,7 +1912,7 @@ def preprocess_inputs(
             attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
         result = {"input_ids": input_ids, "attention_mask": attention_mask}
         if image is not None:
-            result["pixel_values"] = processor(images=[image], return_tensors="pt")["pixel_values"]
+            result["images"] = processor(images=[image], return_tensors="pt")["pixel_values"]
         return result
 
 
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 57d4b64764..097f20991a 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -2182,11 +2182,7 @@ def test_compare_to_transformers(self, model_arch):
         ov_model.clear_requests()
         self._check_device_and_request(ov_model, test_device, False)
 
-        # nanollava pixel_values input named as images
-        if model_arch == "nanollava":
-            pixel_values = transformers_inputs.pop("pixel_values", None)
-            transformers_inputs["images"] = pixel_values
-        # pytorch minicpmv is not designed to be used via forward
+        # pytorch minicpmv and internvl2 is not designed to be used via forward
         if model_arch not in ["minicpmv", "internvl2"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)

From 5fda99f93c644fb5188543bf6f73f670d2ea2c82 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Wed, 29 Jan 2025 18:35:50 +0400
Subject: [PATCH 2/4] update test refs

---
 tests/openvino/test_exporters_cli.py | 2 +-
 tests/openvino/test_quantization.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 6d1bf5eb5e..04478a4445 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -197,7 +197,7 @@ class OVCLIExportTestCase(unittest.TestCase):
             "image-text-to-text",
             "nanollava",
             "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
-            [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+            [{"int8": 14, "int4": 16}, {"int8": 15}, {"int8": 1}],
         ),
         (
             "image-text-to-text",
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index c4c0ff247d..baf28ba72c 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -466,7 +466,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 tokenizer=MODEL_NAMES["nanollava"],
                 trust_remote_code=True,
             ),
-            [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
+            [{"int8": 14, "int4": 16}, {"int8": 15}, {"int8": 1}],
         ),
     ]
 )

From 4d0a522632ce695829af05bea1d749cdb2b9ddf6 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Wed, 29 Jan 2025 19:03:01 +0400
Subject: [PATCH 3/4] properly fix quantization

---
 optimum/intel/openvino/modeling_visual_language.py | 2 ++
 tests/openvino/test_exporters_cli.py               | 2 +-
 tests/openvino/test_quantization.py                | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 813b9ace2e..eb94afadc5 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1738,6 +1738,8 @@ def get_multimodal_embeddings(
         vision_embeds = None
         IGNORE_INDEX = -100
         IMAGE_TOKEN_INDEX = -200
+        if pixel_values is None and "images" in kwargs:
+            pixel_values = kwargs["images"]
         if pixel_values is not None:
             vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, **kwargs)
         if vision_embeds is None:
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 04478a4445..6d1bf5eb5e 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -197,7 +197,7 @@ class OVCLIExportTestCase(unittest.TestCase):
             "image-text-to-text",
             "nanollava",
             "int4 --group-size 8 --ratio 0.8 --trust-remote-code",
-            [{"int8": 14, "int4": 16}, {"int8": 15}, {"int8": 1}],
+            [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
         ),
         (
             "image-text-to-text",
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index baf28ba72c..c4c0ff247d 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -466,7 +466,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 tokenizer=MODEL_NAMES["nanollava"],
                 trust_remote_code=True,
             ),
-            [{"int8": 14, "int4": 16}, {"int8": 15}, {"int8": 1}],
+            [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}],
         ),
     ]
 )

From 2927556de40eaac7862349b2f792d39c5adfab70 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Wed, 29 Jan 2025 19:06:07 +0400
Subject: [PATCH 4/4] Update tests/openvino/test_modeling.py

---
 tests/openvino/test_modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 097f20991a..abe3d7d363 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -2182,7 +2182,7 @@ def test_compare_to_transformers(self, model_arch):
         ov_model.clear_requests()
         self._check_device_and_request(ov_model, test_device, False)
 
-        # pytorch minicpmv and internvl2 is not designed to be used via forward
+        # pytorch minicpmv and internvl2 are not designed to be used via forward
        if model_arch not in ["minicpmv", "internvl2"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)
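
Usage note (not part of the patches above; a minimal sketch of the behaviour they introduce): after PATCH 1/4 and 3/4, the OpenVINO nanollava wrapper accepts the image tensor under the original model's "images" name and routes it to pixel_values internally, so the output of preprocess_inputs can be fed straight into generation. The checkpoint id, processor loading, and any keyword names not visible in the diffs above are illustrative assumptions, not verified API.

    from PIL import Image
    from transformers import AutoProcessor, AutoTokenizer
    from optimum.intel import OVModelForVisualCausalLM

    model_id = "qnguyen3/nanoLLaVA"  # assumed example checkpoint
    model = OVModelForVisualCausalLM.from_pretrained(model_id, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)  # assumed processor source

    image = Image.open("example.png")
    # preprocess_inputs now stores the image tensor under "images" (PATCH 1/4) ...
    inputs = model.preprocess_inputs(
        text="What is on the picture?", image=image, processor=processor, tokenizer=tokenizer
    )
    # ... and forward/get_multimodal_embeddings pick "images" up as pixel_values (PATCH 1/4 and 3/4).
    output_ids = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))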