
Commit 8656c26

Merge branch 'huggingface:main' into qwen
2 parents e75b45b + fe10aaa commit 8656c26

19 files changed: +753 −230 lines changed

docs/source/openvino/export.mdx (+4 −5)

@@ -31,7 +31,7 @@ Check out the help for more options:
 
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,10 +67,9 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8}
+  --quant-mode {int8,f8e4m3,f8e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
-                        activations. The only currently supported choice is 'int8' for int8 quantization of both
-                        weights and activations.
+                        activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
                         The library used to load the model before export. If not provided, will attempt to infer the
                         local checkpoint's library
@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 </Tip>
 
 
-Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Please see the example below.
 
 ```bash
 optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
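The same full quantization can also be driven from Python; a minimal sketch, assuming `OVQuantizationConfig` exposes `dataset`, `num_samples` and `smooth_quant_alpha` arguments mirroring the CLI flags above:

```python
# Minimal sketch of a Python-API equivalent of the CLI command above; assumes
# OVQuantizationConfig mirrors --dataset/--num-samples/--smooth-quant-alpha.
from optimum.intel import OVModelForSpeechSeq2Seq, OVQuantizationConfig

q_config = OVQuantizationConfig(
    dataset="librispeech",      # calibration dataset for activation statistics
    num_samples=32,             # number of calibration samples
    smooth_quant_alpha=0.9,     # SmoothQuant migration strength
)
model = OVModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3-turbo", export=True, quantization_config=q_config
)
model.save_pretrained("whisper-large-v3-turbo")
```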

optimum/commands/export/openvino.py (+1 −5)

@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8"],
+        choices=["int8", "f8e4m3", "f8e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
-            "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
         ),
     )
     optional_group.add_argument(
@@ -365,9 +364,6 @@ def run(self):
             quantization_config["trust_remote_code"] = self.args.trust_remote_code
             ov_config = OVConfig(quantization_config=quantization_config)
         else:
-            if self.args.quant_mode != "int8":
-                raise ValueError("Only 'int8' quantization mode is currently supported.")
-
             quantization_config = {
                 "weight_format": self.args.quant_mode,
                 "activation_format": self.args.quant_mode,

optimum/exporters/openvino/__main__.py (+6 −1)

@@ -86,6 +86,7 @@ def infer_task(
             revision=revision,
             cache_dir=cache_dir,
             token=token,
+            library_name=library_name,
         )
     except KeyError as e:
         raise KeyError(
@@ -274,7 +275,11 @@ def main_export(
             f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}."
         )
 
-    if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
+    if (
+        is_transformers_version(">=", "4.36")
+        and is_transformers_version("<=", "4.45.0")
+        and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
+    ):
         loading_kwargs["attn_implementation"] = "eager"
 
     # some models force flash_attn attention by default that does not support load model on cpu
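The reworked condition bounds the eager-attention workaround to a transformers version window instead of every release from 4.36 onward. A standalone sketch of the same bounded gate built on `packaging` (the `is_transformers_version` helper is the repository's own; this version is only illustrative):

```python
# Illustrative stand-in for the bounded version gate above, using packaging
# instead of the repository's is_transformers_version helper.
from packaging import version
import transformers

tf_version = version.parse(transformers.__version__)
loading_kwargs = {}

# Only the affected release window needs eager attention for these archs.
if version.parse("4.36") <= tf_version <= version.parse("4.45.0"):
    loading_kwargs["attn_implementation"] = "eager"
```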

optimum/exporters/openvino/convert.py (+88 −46)

@@ -28,10 +28,13 @@
 from openvino.tools.ovc import convert_model
 from optimum.exporters import TasksManager
 from optimum.exporters.utils import (
-    _get_submodels_and_export_configs as _default_get_submodels_and_export_configs,
+    DECODER_NAME,
+    ENCODER_NAME,
+    _get_submodels_for_export_encoder_decoder,
+    get_diffusion_models_for_export,
 )
 from optimum.exporters.utils import (
-    get_diffusion_models_for_export,
+    _get_submodels_and_export_configs as _default_get_submodels_and_export_configs,
 )
 from optimum.intel.utils.import_utils import (
     _diffusers_version,
@@ -43,7 +46,6 @@
     _torch_version,
     _transformers_version,
     compare_versions,
-    is_diffusers_version,
     is_openvino_tokenizers_version,
     is_openvino_version,
     is_tokenizers_version,
@@ -101,15 +103,18 @@ def _set_runtime_options(
 ):
     for model_name in models_and_export_configs.keys():
         _, sub_export_config = models_and_export_configs[model_name]
-        sub_export_config.runtime_options = {}
+        if not hasattr(sub_export_config, "runtime_options"):
+            sub_export_config.runtime_options = {}
         if (
-            "diffusers" in library_name
-            or "text-generation" in task
+            "text-generation" in task
             or ("image-text-to-text" in task and model_name == "language_model")
+            or getattr(sub_export_config, "stateful", False)
         ):
             sub_export_config.runtime_options["ACTIVATIONS_SCALE_FACTOR"] = "8.0"
         if not quantized_model and (
-            "text-generation" in task or ("image-text-to-text" in task and model_name == "language_model")
+            "text-generation" in task
+            or ("image-text-to-text" in task and model_name == "language_model")
+            or getattr(sub_export_config, "stateful", False)
         ):
             sub_export_config.runtime_options["KV_CACHE_PRECISION"] = "f16"
 
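Two behavioral changes hide in this hunk: `runtime_options` set earlier (for example by the diffusion helpers further down) now survive instead of being reset, and stateful submodel configs receive the precision hints too. A toy sketch of the guard, with a hypothetical config object:

```python
# Toy sketch of the new guard with a hypothetical config object: a
# runtime_options dict set upstream is no longer wiped, and stateful
# configs now receive the KV-cache precision hint as well.
class DummyExportConfig:
    stateful = True
    runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "128.0"}  # set upstream

cfg = DummyExportConfig()
if not hasattr(cfg, "runtime_options"):  # previously: unconditional reset to {}
    cfg.runtime_options = {}
if getattr(cfg, "stateful", False):
    cfg.runtime_options["KV_CACHE_PRECISION"] = "f16"
print(cfg.runtime_options)
# {'ACTIVATIONS_SCALE_FACTOR': '128.0', 'KV_CACHE_PRECISION': 'f16'}
```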

@@ -642,10 +647,14 @@ def export_from_model(
 
     logger.info(f"Automatic task detection to: {task}.")
 
+    is_encoder_decoder = getattr(getattr(model, "config", {}), "is_encoder_decoder", False)
+    model_type = getattr(getattr(model, "config", {}), "model_type", "")
     stateful = stateful and (
-        ensure_export_task_support_stateful(task)
-        or ensure_model_type_support_stateful(getattr(getattr(model, "config", {}), "model_type", ""))
+        ensure_export_task_support_stateful(task) or ensure_model_type_support_stateful(model_type)
     )
+
+    if stateful and is_encoder_decoder and not getattr(model, "_supports_cache_class", False):
+        stateful = False
     # TODO: support onnx_config.py in the model repo
     if custom_architecture and custom_export_configs is None:
         raise ValueError(
@@ -687,6 +696,11 @@ def export_from_model(
     if library_name == "diffusers":
         export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino")
         stateful_submodels = False
+    elif stateful and is_encoder_decoder and not custom_architecture:
+        export_config, models_and_export_configs = _get_encoder_decoder_stateful_models_for_export(
+            model=model, task=task, preprocessors=preprocessors, library_name=library_name, _variant="default"
+        )
+        stateful_submodels = [False, True]
     else:
         logging.disable(logging.INFO)
         export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs(
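Together, these two hunks route encoder-decoder models onto a dedicated stateful path, falling back to stateless export when the architecture predates transformers' `Cache` class. A condensed, hypothetical sketch of the decision flow (only the attribute and helper names come from the diff):

```python
# Condensed, hypothetical sketch of the dispatch added to export_from_model.
def choose_export_path(model, task, stateful, custom_architecture, library_name):
    config = getattr(model, "config", None)
    is_encoder_decoder = getattr(config, "is_encoder_decoder", False)

    # Models without Cache support cannot hold the KV-cache as model state.
    if stateful and is_encoder_decoder and not getattr(model, "_supports_cache_class", False):
        stateful = False

    if library_name == "diffusers":
        return "diffusion"
    if stateful and is_encoder_decoder and not custom_architecture:
        return "stateful-seq2seq"  # stateless encoder + stateful decoder
    return "default"
```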
@@ -999,45 +1013,29 @@ def _get_submodels_and_export_configs(
 def get_diffusion_models_for_export_ext(
     pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino"
 ):
-    if is_diffusers_version(">=", "0.29.0"):
-        from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
-
-        sd3_pipes = [StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline]
-        if is_diffusers_version(">=", "0.30.0"):
-            from diffusers import StableDiffusion3InpaintPipeline
-
-            sd3_pipes.append(StableDiffusion3InpaintPipeline)
-
-        is_sd3 = isinstance(pipeline, tuple(sd3_pipes))
-    else:
-        is_sd3 = False
-
-    if is_diffusers_version(">=", "0.30.0"):
-        from diffusers import FluxPipeline
-
-        flux_pipes = [FluxPipeline]
-
-        if is_diffusers_version(">=", "0.31.0"):
-            from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline
-
-            flux_pipes.extend([FluxPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline])
-
-        if is_diffusers_version(">=", "0.32.0"):
-            from diffusers import FluxFillPipeline
-
-            flux_pipes.append(FluxFillPipeline)
-
-        is_flux = isinstance(pipeline, tuple(flux_pipes))
-    else:
-        is_flux = False
-
-    if not is_sd3 and not is_flux:
-        return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
-    if is_sd3:
+    is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL")
+    is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3")
+    is_flux = pipeline.__class__.__name__.startswith("Flux")
+    is_sd = pipeline.__class__.__name__.startswith("StableDiffusion") and not is_sd3
+    is_lcm = pipeline.__class__.__name__.startswith("LatentConsistencyModel")
+
+    if is_sd or is_sdxl or is_lcm:
+        models_for_export = get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
+        if is_sdxl and pipeline.vae.config.force_upcast:
+            models_for_export["vae_encoder"][1].runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "128.0"}
+            models_for_export["vae_decoder"][1].runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "128.0"}
+
+        # only SD 2.1 has overflow issue, it uses different prediction_type than other models
+        if is_sd and pipeline.scheduler.config.prediction_type == "v_prediction":
+            models_for_export["vae_encoder"][1].runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+            models_for_export["vae_decoder"][1].runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+
+    elif is_sd3:
         models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
-    else:
+    elif is_flux:
         models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype)
-
+    else:
+        raise ValueError(f"Unsupported pipeline type `{pipeline.__class__.__name__}` provided")
     return None, models_for_export
 
 
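The rewrite trades version-gated `isinstance` checks for class-name prefix matching, so pipelines added in newer diffusers releases are classified without importing them. A toy illustration (the classes here are local stand-ins, not diffusers imports):

```python
# Toy illustration of prefix-based pipeline classification; the classes are
# local stand-ins, not diffusers imports.
class StableDiffusionXLPipeline: ...
class StableDiffusion3InpaintPipeline: ...
class FluxFillPipeline: ...

for pipe_cls in (StableDiffusionXLPipeline, StableDiffusion3InpaintPipeline, FluxFillPipeline):
    name = pipe_cls.__name__
    is_sdxl = name.startswith("StableDiffusionXL")
    is_sd3 = name.startswith("StableDiffusion3")
    is_flux = name.startswith("Flux")
    print(name, "->", "sdxl" if is_sdxl else "sd3" if is_sd3 else "flux" if is_flux else "other")
```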

@@ -1135,6 +1133,7 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype):
             int_dtype=int_dtype,
             float_dtype=float_dtype,
         )
+        export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
         models_for_export["text_encoder_3"] = (text_encoder_3, export_config)
 
     return models_for_export
@@ -1172,6 +1171,7 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype):
     transformer_export_config = export_config_constructor(
         pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype
     )
+    transformer_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
     models_for_export["transformer"] = (transformer, transformer_export_config)
 
     # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
@@ -1187,6 +1187,7 @@
     vae_encoder_export_config = vae_config_constructor(
         vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
     )
+    vae_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
     models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config)
 
     # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600
@@ -1202,6 +1203,7 @@
     vae_decoder_export_config = vae_config_constructor(
         vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype
     )
+    vae_decoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
     models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config)
 
     text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
@@ -1218,6 +1220,46 @@
             int_dtype=int_dtype,
             float_dtype=float_dtype,
         )
+        export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
         models_for_export["text_encoder_2"] = (text_encoder_2, export_config)
 
     return models_for_export
+
+
+def _get_encoder_decoder_stateful_models_for_export(
+    model: Union["PreTrainedModel", "TFPreTrainedModel"],
+    task: str,
+    _variant: str,
+    library_name: str,
+    int_dtype: str = "int64",
+    float_dtype: str = "fp32",
+    preprocessors: Optional[List[Any]] = None,
+):
+    export_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=model, exporter="openvino", task=task, library_name=library_name
+    )
+    export_config = export_config_constructor(
+        model.config,
+        int_dtype=int_dtype,
+        float_dtype=float_dtype,
+        preprocessors=preprocessors,
+        legacy=False,
+    )
+
+    export_config.variant = _variant
+    all_variants = "\n".join([f"    - {name}: {description}" for name, description in export_config.VARIANTS.items()])
+    logger.info(f"Using the export variant {export_config.variant}. Available variants are:\n{all_variants}")
+
+    models_for_export = _get_submodels_for_export_encoder_decoder(model, use_past=False)
+
+    encoder_export_config = export_config.with_behavior("encoder")
+    models_for_export[ENCODER_NAME] = (models_for_export[ENCODER_NAME], encoder_export_config)
+
+    decoder_export_config_with_past = export_config.with_behavior("decoder", use_past=True, use_past_in_inputs=True)
+
+    decoder_export_config_with_past.stateful = True
+    models_for_export[DECODER_NAME] = (
+        models_for_export[DECODER_NAME],
+        decoder_export_config_with_past,
+    )
+    return None, models_for_export
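The helper mirrors `stateful_submodels = [False, True]` at its call site: the first entry (encoder) stays stateless, while the second (decoder) is exported with past inputs folded into model state. A schematic of the returned mapping using placeholder objects; the key strings assume `ENCODER_NAME`/`DECODER_NAME` from `optimum.exporters.utils` resolve to `"encoder_model"`/`"decoder_model"`:

```python
# Schematic of the structure returned above, with placeholder objects.
encoder_submodel, decoder_submodel = object(), object()

class PlaceholderConfig:  # stand-in for the real OpenVINO export config
    def __init__(self, behavior, use_past=False, stateful=False):
        self.behavior, self.use_past, self.stateful = behavior, use_past, stateful

models_for_export = {
    "encoder_model": (encoder_submodel, PlaceholderConfig("encoder")),
    "decoder_model": (decoder_submodel, PlaceholderConfig("decoder", use_past=True, stateful=True)),
}
```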

optimum/exporters/openvino/model_configs.py (+70)

@@ -20,6 +20,7 @@
 from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel
 from transformers.utils import is_tf_available
 
+from optimum.exporters.onnx.base import ConfigBehavior
 from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
 from optimum.exporters.onnx.model_configs import (
     CLIPOnnxConfig,
@@ -38,8 +39,10 @@
     MistralOnnxConfig,
     MPTOnnxConfig,
     PhiOnnxConfig,
+    T5OnnxConfig,
     UNetOnnxConfig,
     VisionOnnxConfig,
+    WhisperOnnxConfig,
 )
 from optimum.exporters.onnx.model_patcher import ModelPatcher
 from optimum.exporters.tasks import TasksManager
@@ -102,6 +105,7 @@
     Qwen2VLVisionEmbMergerPatcher,
     QwenModelPatcher,
     RotaryEmbPatcher,
+    StatefulSeq2SeqDecoderPatcher,
     UpdateCausalMaskModelPatcher,
     XverseModelPatcher,
 )
@@ -2611,3 +2615,69 @@ def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
         return GptBigCodeModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager(
+    "whisper",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "audio-classification",
+        "automatic-speech-recognition",
+        "automatic-speech-recognition-with-past",
+    ],
+    library_name="transformers",
+)
+class WhisperOpenVINOConfig(WhisperOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER:
+            return StatefulSeq2SeqDecoderPatcher(self, model, model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs)
+
+    @property
+    def inputs(self):
+        common_inputs = super().inputs
+        if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER:
+            common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"}
+        return common_inputs
+
+
+@register_in_tasks_manager(
+    "t5",
+    *["feature-extraction", "feature-extraction-with-past", "text2text-generation", "text2text-generation-with-past"],
+    library_name="transformers",
+)
+class T5OpenVINOConfig(T5OnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER:
+            return StatefulSeq2SeqDecoderPatcher(self, model, model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs)
+
+    @property
+    def inputs(self):
+        common_inputs = super().inputs
+        if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER:
+            common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"}
+        return common_inputs
+
+
+@register_in_tasks_manager(
+    "mt5",
+    *["feature-extraction", "feature-extraction-with-past", "text2text-generation", "text2text-generation-with-past"],
+    library_name="transformers",
+)
+class MT5OpenVINOConfig(T5OpenVINOConfig):
+    pass
+
+
+@register_in_tasks_manager(
+    "longt5",
+    *["feature-extraction", "feature-extraction-with-past", "text2text-generation", "text2text-generation-with-past"],
+    library_name="transformers",
+)
+class LongT5OpenVINOConfig(T5OpenVINOConfig):
+    pass
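With these configs registered, seq2seq exports that go through the OpenVINO tasks manager pick up the stateful decoder automatically. A quick smoke test, sketched with the standard optimum-intel loading path:

```python
# Sketch: exporting a T5 checkpoint via optimum-intel; with the configs above
# registered, the decoder should be exported in its stateful form.
from optimum.intel import OVModelForSeq2SeqLM
from transformers import AutoTokenizer

model_id = "google-t5/t5-small"
model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("translate English to German: Hello, world!", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs)[0], skip_special_tokens=True))
```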
