Add transformers 4.49 support (huggingface#1172)

echarlaix · web-flow · commit 4f79e05e2ea8 · 2025-02-26T18:28:04.000+01:00
* transformers 4.49

* fix qwen2vl patcher

* disable tests for models incompatible with 4.49

* fix

* fix

* skip tests

* disable test

* disable test

* update expected tests op quantized

* fix

* update quant op tests

* add back test

* fix pattern

* style

* disable

* add minicpmv back
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
@@ -3935,14 +3935,28 @@ def __enter__(self):
         # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L390
         # added attention_mask input instead of internal calculation (unsupported by tracing due to cycle with dynamic len)
         def sdpa_attn_forward(
-            self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor = None
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: torch.Tensor,
+            rotary_pos_emb: torch.Tensor = None,
+            position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         ) -> torch.Tensor:
             from transformers.models.qwen2_vl.modeling_qwen2_vl import apply_rotary_pos_emb_vision
 
             seq_length = hidden_states.shape[0]
             q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
-            q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
-            k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
+
+            if is_transformers_version(">=", "4.49"):
+                if position_embeddings is None:
+                    emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+                    cos = emb.cos().float()
+                    sin = emb.sin().float()
+                else:
+                    cos, sin = position_embeddings
+                q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+            else:
+                q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
+                k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
 
             q = q.transpose(0, 1)
             k = k.transpose(0, 1)
diff --git a/setup.py b/setup.py
@@ -29,7 +29,7 @@
 INSTALL_REQUIRE = [
     "torch>=1.11",
     "optimum~=1.24",
-    "transformers>=4.36,<4.49",
+    "transformers>=4.36,<4.50",
     "datasets>=1.4.0",
     "sentencepiece",
     "setuptools",
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
@@ -1039,6 +1039,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
+
+        # TODO: add back once dtype fixed everywhere
+        # https://huggingface.co/katuni4ka/tiny-random-chatglm2/blob/main/modeling_chatglm.py#L720
+        # https://huggingface.co/katuni4ka/tiny-random-chatglm2/blob/main/modeling_chatglm.py#L759
+        if model_arch in {"chatglm", "glm4"} and is_transformers_version(">=", "4.49"):
+            self.skipTest("Incompatible modeling code")
+
         not_stateful = []
         if is_openvino_version("<", "2024.0"):
             not_stateful.append("mixtral")
@@ -1117,6 +1124,11 @@ def test_compare_to_transformers(self, model_arch):
         )
 
         ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
+
+        # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 is merged (for all models), as the current modeling code is incompatible with transformers >= v4.49
+        if model_arch in {"minicpm", "minicpm3", "arctic", "deepseek"} and is_transformers_version(">=", "4.49"):
+            self.skipTest("Incompatible modeling code")
+
         additional_inputs = {}
         # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache,
         # align cache representation in torch model
@@ -2119,6 +2131,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
         SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
     if is_transformers_version(">=", "4.45.0"):
         SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"]
+
     if is_transformers_version(">=", "4.46.0"):
         SUPPORTED_ARCHITECTURES += ["maira2"]
 
@@ -2220,6 +2233,11 @@ def test_compare_to_transformers(self, model_arch):
         set_seed(SEED)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
+
+        # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 is merged for all models, as the current modeling code is incompatible with transformers >= v4.49
+        if model_arch in {"phi3_v", "nanollava"} and is_transformers_version(">=", "4.49"):
+            self.skipTest("Incompatible modeling code")
+
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(**transformers_inputs, generation_config=gen_config)
 
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
@@ -91,6 +91,8 @@
     "text-classification": ("glue", "sst2", "sentence"),
 }
 
+pattern_prefix = "^__module.model.model" if is_transformers_version(">=", "4.49") else "^__module.model"
+
 
 class OVQuantizerTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_TORCH_MODEL = (
@@ -158,12 +160,12 @@ class OVQuantizerTest(unittest.TestCase):
                     dtype="nf4",
                     group_size=16,
                     ratio=0.5,
-                    ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]},
+                    ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.self_attn"]},
                 ),
                 full_quantization_config=OVQuantizationConfig(
-                    dtype="f8e4m3", ignored_scope={"patterns": ["^__module.model.layers.0.mlp"]}
+                    dtype="f8e4m3", ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.mlp"]}
                 ),
-                ignored_scope={"patterns": ["^__module.model.layers.1.self_attn"]},
+                ignored_scope={"patterns": [f"{pattern_prefix}.layers.1.self_attn"]},
                 dataset="wikitext2",
                 num_samples=1,
             ),
@@ -183,12 +185,12 @@ class OVQuantizerTest(unittest.TestCase):
                     dtype="nf4",
                     group_size=16,
                     ratio=0.5,
-                    ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]},
+                    ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.self_attn"]},
                 ),
                 full_quantization_config=OVQuantizationConfig(
-                    dtype="f8e5m2", ignored_scope={"patterns": ["^__module.model.layers.0.mlp"]}
+                    dtype="f8e5m2", ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.mlp"]}
                 ),
-                ignored_scope={"patterns": ["^__module.model.layers.1.self_attn"]},
+                ignored_scope={"patterns": [f"{pattern_prefix}.layers.1.self_attn"]},
                 dataset="wikitext2",
                 num_samples=1,
             ),
@@ -435,7 +437,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="c4",
             ),
-            [{"int8": 14, "int4": 25}],
+            [{"int8": 18, "int4": 23}] if is_transformers_version(">=", "4.49") else [{"int8": 14, "int4": 25}],
         ),
         (
             OVModelForCausalLM,
@@ -449,7 +451,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 sensitivity_metric="mean_activation_magnitude",
                 dataset=["one two, " * i for i in range(10)],
             ),
-            [{"int8": 16, "int4": 24}],
+            [{"int8": 18, "int4": 23}] if is_transformers_version(">=", "4.49") else [{"int8": 16, "int4": 24}],
         ),
         (
             OVModelForCausalLM,
@@ -612,21 +614,23 @@ class OVWeightCompressionTest(unittest.TestCase):
                     ),
                     [{"int8": 8, "int4": 22}, {"int8": 1}, {"int8": 11}],
                 ),
-                (
-                    OVModelForVisualCausalLM,
-                    "phi3_v",
-                    True,
-                    dict(
-                        bits=4,
-                        group_size=16,
-                        dataset="contextual",
-                        ratio=0.8,
-                        sensitivity_metric="mean_activation_magnitude",
-                        num_samples=1,
-                        trust_remote_code=True,
-                    ),
-                    [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
-                ),
+                # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-phi3-vision/blob/main/processing_phi3_v.py#L313 modified to add chat_template
+                # currently incompatible with transformers >= v4.49
+                # (
+                #     OVModelForVisualCausalLM,
+                #     "phi3_v",
+                #     True,
+                #     dict(
+                #         bits=4,
+                #         group_size=16,
+                #         dataset="contextual",
+                #         ratio=0.8,
+                #         sensitivity_metric="mean_activation_magnitude",
+                #         num_samples=1,
+                #         trust_remote_code=True,
+                #     ),
+                #     [{"int8": 4, "int4": 14}, {"int8": 1}, {"int8": 7}, {"int8": 2}],
+                # ),
                 (
                     OVModelForVisualCausalLM,
                     "qwen2_vl",