@@ -2803,7 +2803,6 @@ def patched_forward(*args, **kwargs):
            signature = inspect.signature(self.orig_forward)
            args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs)
-
            return_legacy_cache = False
            pkv_in_args = False
            legacy_pkv = None
@@ -4407,7 +4406,7 @@ def __init__(
        super().__init__(config, model, model_kwargs)


-class GotOCR2ImageEmbeddingsModelPatcher(ModelPatcher):
+class CommonImageEmbeddingsModelPatcher(ModelPatcher):
    def __init__(
        self,
        config: "OnnxConfig",
@@ -4416,9 +4415,107 @@ def __init__(
    ):
        model.__orig_forward = model.forward
        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321
        model.forward = model.get_image_features
        super().__init__(config, model, model_kwargs)

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        self._model.forward = self._model.__orig_forward
+
+
+# Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1147
+def _gemma3_mm_update_causal_mask(
+    self, attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training: bool = False
+):
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted
+        # form and requires no inversion or slicing.
+        return attention_mask
+
+    min_dtype = torch.finfo(torch.float16).min
+    inputs_lead_dim, sequence_length = input_tensor.shape[:2]
+    target_length = (
+        attention_mask.shape[-1]
+        if isinstance(attention_mask, torch.Tensor)
+        else cache_position[0] + sequence_length + 1
+    )
+
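+    # Start from a fully masked additive bias (min_dtype everywhere); attendable positions are zeroed out below.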
+    causal_mask = torch.full(
+        (sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device
+    )
+
+    # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
+    if sequence_length != 1:
+        causal_mask = torch.triu(causal_mask, diagonal=1)
+
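+    # Un-mask (zero) every position up to and including each query token's absolute position; strictly later positions keep the min_dtype bias.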
+    causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+    causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
+
+    # Apply bidirectional mask on images if token type ids are provided
+    if token_type_ids is not None and sequence_length != 1:
+        token_type_mask = token_type_ids.unsqueeze(1) == token_type_ids.unsqueeze(2)
+        token_type_mask[token_type_ids == 0] = False  # if text token, do not change anything
+        token_type_mask = token_type_mask.unsqueeze(1).to(causal_mask.device, dtype=torch.bool)
+        causal_mask = causal_mask.clone()
+        causal_mask[:, :, :, :sequence_length] = causal_mask[:, :, :, :sequence_length].masked_fill(
+            token_type_mask, 0.0
+        )
+
+    if attention_mask is not None:
+        causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+        mask_length = attention_mask.shape[-1]
+
+        # Then apply padding mask (will mask pad tokens)
+        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+        padding_mask = padding_mask == 0
+        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
+
+    return causal_mask
+
+
+class Gemma3LMModelPatcher(DecoderModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        model.__orig_forward = model.forward
+        model._update_causal_mask_mm = types.MethodType(_gemma3_mm_update_causal_mask, model)
+
+        # Differences from the original:
+        # - uses a DynamicCache built from the legacy cache instead of HybridCache
+        # - computes the causal mask for the multimodal inputs
+        def forward(self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds):
+            from transformers.cache_utils import DynamicCache
+
+            pkv = DynamicCache.from_legacy_cache(past_key_values)
+
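+            # The legacy cache is a per-layer tuple of (key, value) tensors shaped [batch, num_heads, seq_len, head_dim],
+            # so dim -2 is the number of tokens already processed.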
+            past_seen_tokens = past_key_values[0][0].shape[-2]
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+            causal_mask = self._update_causal_mask_mm(
+                attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds
+            )
+
+            result = self.__orig_forward(
+                input_ids=None,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                cache_position=cache_position,
+                past_key_values=pkv,
+                inputs_embeds=inputs_embeds,
+            )
+            upd_pkv = result["past_key_values"]
+            result["past_key_values"] = upd_pkv.to_legacy_cache()
+            return result
+
+        model.forward = types.MethodType(forward, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
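A minimal usage sketch for orientation (hypothetical: onnx_config and language_model are placeholders, not names from this diff; the only assumption is the existing ModelPatcher context-manager protocol, which the __exit__ override above participates in):

# Hypothetical driver code for the patcher above
patcher = Gemma3LMModelPatcher(onnx_config, language_model, model_kwargs=None)
with patcher:
    # while patched, forward takes (attention_mask, position_ids, past_key_values,
    # token_type_ids, inputs_embeds) and returns past_key_values as legacy tuples
    ...  # trace / export the language model here
# leaving the context restores the original forward via __exit__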