update tests and add comments to patcher

eaidova · eaidova · commit ecaa78b3d40d · 2025-02-27T11:27:03.000+04:00
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
@@ -132,11 +132,10 @@ def init_model_configs():
         "transformers",
         "Qwen2VLForConditionalGeneration",
     )
-    if is_transformers_version(">", "4.48.99"):
-        TasksManager._CUSTOM_CLASSES[("pt", "qwen2-5-vl", "image-text-to-text")] = (
-            "transformers",
-            "Qwen2_5_VLForConditionalGeneration",
-        )
+    TasksManager._CUSTOM_CLASSES[("pt", "qwen2-5-vl", "image-text-to-text")] = (
+        "transformers",
+        "Qwen2_5_VLForConditionalGeneration",
+    )
     TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[
         "image-text-to-text"
     ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"]
@@ -2716,6 +2715,8 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
 
 @register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text"], library_name="transformers")
 class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.49.0")
+
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
         if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
@@ -3956,7 +3956,8 @@ def block_forward(self, hidden_states, attention_mask, rotary_pos_emb) -> torch.
             return hidden_states
 
     else:
-
+        # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L391
+        # added attention_mask input instead of calculating it internally (unsupported by tracing due to a loop with dynamic length)
         def sdpa_attn_forward(
             self,
             hidden_states: torch.Tensor,
@@ -4001,6 +4002,8 @@ def apply_rotary_pos_emb_vision(
             attn_output = self.proj(attn_output)
             return attn_output
 
+        # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L446
+        # added attention_mask input propagation to self.attn
         def block_forward(
             self,
             hidden_states,
@@ -4069,8 +4072,9 @@ def __init__(
 
         model.__orig_forward = model.forward
 
-        # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1118
-        # added attention_mask input instead cu_lens for its internal calculation model (unsupported by tracing due to cycle with dynamic len)
+        # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L405
+        # added attention_mask and window_attention_mask inputs instead of processing cu_lens and window_cu_lens to calculate them inside the model
+        # (unsupported by tracing due to a loop with dynamic length)
         # separated patch_embed and rot_pos_emb calls for performing as part of another model
         def image_embed_forward(
             self,
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
@@ -2134,7 +2134,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
 
     if is_transformers_version(">=", "4.46.0"):
         SUPPORTED_ARCHITECTURES += ["maira2"]
-    
+
     if is_transformers_version(">=", "4.49.0"):
         SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"]
     TASK = "image-text-to-text"
@@ -2160,7 +2160,7 @@ def get_transformer_model_class(self, model_arch):
             from transformers import Qwen2VLForConditionalGeneration
 
             return Qwen2VLForConditionalGeneration
-        if model_arch == "qwen_25_vl":
+        if model_arch == "qwen2_5_vl":
             from transformers import Qwen2_5_VLForConditionalGeneration
 
             return Qwen2_5_VLForConditionalGeneration