@@ -4414,6 +4414,7 @@ def __init__(
        model_kwargs: Dict[str, Any],
    ):
        model.__orig_forward = model.forward
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321
        model.forward = model.get_image_features
        super().__init__(config, model, model_kwargs)
@@ -4422,23 +4423,94 @@ def __exit__(self, exc_type, exc_value, traceback):
        self._model.forward = self._model.__orig_forward


-class Gemma3LMModelPatcher(Gemma2ModelPatcher):
+# Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1147
+def _gemma3_mm_update_causal_mask(
+    self, attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training: bool = False
+):
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted
+        # form and requires no inversion or slicing.
+        return attention_mask
+
+    min_dtype = torch.finfo(torch.float16).min
+    inputs_lead_dim, sequence_length = input_tensor.shape[:2]
+    target_length = (
+        attention_mask.shape[-1]
+        if isinstance(attention_mask, torch.Tensor)
+        else cache_position[0] + sequence_length + 1
+    )
+
+    causal_mask = torch.full(
+        (sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device
+    )
+
+    # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
+    if sequence_length != 1:
+        causal_mask = torch.triu(causal_mask, diagonal=1)
+
+    causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+    causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
+
+    # Apply bidirectional mask on images if token type ids are provided
+    if token_type_ids is not None and sequence_length != 1:
+        token_type_mask = token_type_ids.unsqueeze(1) == token_type_ids.unsqueeze(2)
+        token_type_mask[token_type_ids == 0] = False  # if text token, do not change anything
+        token_type_mask = token_type_mask.unsqueeze(1).to(causal_mask.device, dtype=torch.bool)
+        causal_mask = causal_mask.clone()
+        causal_mask[:, :, :, :sequence_length] = causal_mask[:, :, :, :sequence_length].masked_fill(
+            token_type_mask, 0.0
+        )
+
+    if attention_mask is not None:
+        causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+        mask_length = attention_mask.shape[-1]
+
+        # Then apply padding mask (will mask pad tokens)
+        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+        padding_mask = padding_mask == 0
+        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
+
+    return causal_mask
+
+
+class Gemma3LMModelPatcher(DecoderModelPatcher):
    def __init__(
        self,
        config: "OnnxConfig",
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        model_kwargs: Optional[Dict[str, Any]] = None,
    ):
        model.__orig_forward = model.forward
+        model._update_causal_mask_mm = types.MethodType(_gemma3_mm_update_causal_mask, model)
+
+        # Differences from the original forward:
+        # - builds a DynamicCache from the legacy cache instead of using HybridCache
+        # - computes the causal mask for the multimodal inputs explicitly
+        def forward(self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds):
+            from transformers.cache_utils import DynamicCache
+
+            pkv = DynamicCache.from_legacy_cache(past_key_values)

-        def forward(self, attention_mask, position_ids, past_key_values, inputs_embeds):
-            return self.__orig_forward(
+            past_seen_tokens = past_key_values[0][0].shape[-2]
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+            causal_mask = self._update_causal_mask_mm(
+                attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds
+            )
+
+            result = self.__orig_forward(
                input_ids=None,
-                attention_mask=attention_mask,
+                attention_mask=causal_mask,
                position_ids=position_ids,
-                past_key_values=past_key_values,
+                cache_position=cache_position,
+                past_key_values=pkv,
                inputs_embeds=inputs_embeds,
            )
+            upd_pkv = result["past_key_values"]
+            result["past_key_values"] = upd_pkv.to_legacy_cache()
+            return result

        model.forward = types.MethodType(forward, model)
        super().__init__(config, model, model_kwargs)
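For reference, a minimal standalone sketch (not part of the patch) of the bidirectional image-token masking that `_gemma3_mm_update_causal_mask` applies on top of the causal mask. The `token_type_ids` convention (1 for image tokens, 0 for text) and the tiny sequence below are assumptions for illustration only.

```python
# Minimal sketch, assuming token_type_ids uses 1 for image tokens and 0 for text.
import torch

token_type_ids = torch.tensor([[0, 1, 1, 0]])  # text, image, image, text
seq_len = token_type_ids.shape[1]
min_dtype = torch.finfo(torch.float16).min

# Start from a standard causal mask (0 = attend, min_dtype = blocked).
causal_mask = torch.triu(torch.full((seq_len, seq_len), min_dtype), diagonal=1)
causal_mask = causal_mask[None, None, :, :].clone()

# Pairwise "same token type" matrix; rows belonging to text tokens are cleared,
# so only image tokens receive the extra (bidirectional) attention.
token_type_mask = token_type_ids.unsqueeze(1) == token_type_ids.unsqueeze(2)
token_type_mask[token_type_ids == 0] = False
token_type_mask = token_type_mask.unsqueeze(1)

causal_mask[:, :, :, :seq_len] = causal_mask[:, :, :, :seq_len].masked_fill(token_type_mask, 0.0)
print(causal_mask[0, 0])  # the image block (rows/cols 1-2) is fully unmasked
```

Text tokens keep the standard causal pattern; only the image-image block becomes visible in both directions, which is the behaviour the patched mask helper reproduces for export.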
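Likewise, a small sketch (again, not part of the patch) of the legacy-cache round trip the patched forward performs: the tuple-of-tuples `past_key_values` coming from the exported graph is wrapped into a `DynamicCache` for the call and converted back with `to_legacy_cache` afterwards. The shapes (2 layers, 4 heads, 3 past tokens, head size 8) are arbitrary and only for illustration.

```python
# Minimal sketch of the DynamicCache <-> legacy cache conversion used in the patched forward.
import torch
from transformers.cache_utils import DynamicCache

# Hypothetical legacy cache: a (key, value) pair of tensors per layer.
legacy_pkv = tuple((torch.zeros(1, 4, 3, 8), torch.zeros(1, 4, 3, 8)) for _ in range(2))

pkv = DynamicCache.from_legacy_cache(legacy_pkv)

# cache_position is derived from the number of already-seen tokens, as in the patch.
past_seen_tokens = legacy_pkv[0][0].shape[-2]
new_tokens = 1  # e.g. one decoded token per step
cache_position = torch.arange(past_seen_tokens, past_seen_tokens + new_tokens)

# After the model call, the cache is converted back so the exported graph keeps
# the tuple-of-tuples layout it was traced with.
roundtrip = pkv.to_legacy_cache()
assert roundtrip[0][0].shape == legacy_pkv[0][0].shape
print(cache_position)  # tensor([3])
```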