# optimum/exporters/openvino/convert.py

#  Copyright 2022 The HuggingFace Team. All rights reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import copy
import functools
import gc
import logging
import os
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

from transformers.generation import GenerationMixin
from transformers.utils import is_tf_available, is_torch_available

from openvino.runtime import Model, save_model
from openvino.runtime.exceptions import OVTypeError
from openvino.tools.ovc import convert_model
from optimum.exporters import TasksManager
from optimum.exporters.utils import (
    _get_submodels_and_export_configs as _default_get_submodels_and_export_configs,
)
from optimum.exporters.utils import (
    get_diffusion_models_for_export,
)
from optimum.intel.utils.import_utils import (
    _diffusers_version,
    _nncf_version,
    _open_clip_version,
    _optimum_intel_version,
    _optimum_version,
    _timm_version,
    _torch_version,
    _transformers_version,
    compare_versions,
    is_diffusers_version,
    is_openvino_tokenizers_version,
    is_tokenizers_version,
    is_transformers_version,
)
from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available

from ...intel.utils.import_utils import is_nncf_available
from ...intel.utils.modeling_utils import _infer_library_from_model_or_model_class
from .model_patcher import patch_model_with_bettertransformer
from .stateful import (
    ensure_export_task_support_stateful,
    ensure_model_type_support_stateful,
    ensure_stateful_is_available,
    patch_stateful,
)
from .utils import (
    MULTI_MODAL_TEXT_GENERATION_MODELS,
    OV_XML_FILE_NAME,
    _get_input_info,
    _get_open_clip_submodels_fn_and_export_configs,
    clear_class_registry,
    remove_none_from_dummy_inputs,
    save_config,
    save_preprocessors,
)


# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)

# The framework-specific names below are only bound when the corresponding framework is installed.
# Code that uses `nn`, `PreTrainedModel`, `DiffusionPipeline`, `ModelMixin` or `TFPreTrainedModel` at
# runtime must therefore check the matching `is_*_available()` helper first.
if is_torch_available():
    import torch.nn as nn
    from transformers.modeling_utils import PreTrainedModel

if is_diffusers_available():
    from diffusers import DiffusionPipeline, ModelMixin

if is_tf_available():
    from transformers.modeling_tf_utils import TFPreTrainedModel


# Names imported here are used in (string) type annotations only and do not exist at runtime.
if TYPE_CHECKING:
    from optimum.exporters.onnx.base import OnnxConfig
    from optimum.intel.openvino.configuration import OVConfig


def _set_runtime_options(
    models_and_export_configs: Dict[
        str,
        Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"], "OnnxConfig"],
    ],
    task: str,
    library_name: str,
    quantized_model: bool,
):
    """
    Attach a fresh `runtime_options` dictionary to every export config.

    Args:
        models_and_export_configs (`Dict[str, Tuple[model, OnnxConfig]]`):
            Mapping from submodel name to a `(model, export_config)` pair. Only the export config is touched:
            its `runtime_options` attribute is replaced on every call.
        task (`str`):
            The export task. Substring checks for `"text-generation"` and `"image-text-to-text"` are applied.
        library_name (`str`):
            The source library name. A substring check for `"diffusers"` is applied.
        quantized_model (`bool`):
            Only its truthiness is used. When truthy, `KV_CACHE_PRECISION` is never set.

    The resulting options are:
        - `ACTIVATIONS_SCALE_FACTOR = "8.0"` for diffusers models, text-generation tasks, and the
          `language_model` part of image-text-to-text tasks.
        - `KV_CACHE_PRECISION = "f16"` for text-generation tasks and the `language_model` part of
          image-text-to-text tasks, unless the model is quantized.
    """

    def _is_language_model_part(name):
        # Evaluated lazily so that `task` is only inspected when the original short-circuit logic would do so.
        return "text-generation" in task or ("image-text-to-text" in task and name == "language_model")

    for submodel_name, (_, sub_export_config) in models_and_export_configs.items():
        options = {}
        sub_export_config.runtime_options = options

        if "diffusers" in library_name or _is_language_model_part(submodel_name):
            options["ACTIVATIONS_SCALE_FACTOR"] = "8.0"

        if not quantized_model and _is_language_model_part(submodel_name):
            options["KV_CACHE_PRECISION"] = "f16"


def _save_model(
    model,
    path: str,
    ov_config: Optional["OVConfig"] = None,
    library_name: Optional[str] = None,
    config: "OnnxConfig" = None,
):
    """
    Decorate an OpenVINO model with metadata and serialize it to `path`.

    Args:
        model:
            The converted OpenVINO model to save.
        path (`str`):
            Destination of the serialized model, forwarded to `save_model`.
        ov_config (`OVConfig`, *optional*):
            When provided and its `dtype` equals `"fp16"`, the model is saved with fp16 compression enabled.
        library_name (`str`, *optional*):
            Forwarded to `_add_version_info_to_model`.
        config (`OnnxConfig`, *optional*):
            Export config. Its `runtime_options` attribute, when present, is forwarded to
            `_add_runtime_options_to_rt_info`; otherwise an empty dictionary is used.
    """
    # Third positional argument of `save_model`: whether to compress the saved model to fp16.
    use_fp16_compression = ov_config is not None and ov_config.dtype == "fp16"

    model = _add_version_info_to_model(model, library_name)
    model = _add_runtime_options_to_rt_info(model, getattr(config, "runtime_options", {}))

    save_model(model, path, use_fp16_compression)

    # Drop the local reference before forcing a collection pass.
    del model
    gc.collect()


def export(
    model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"],
    config: "OnnxConfig",
    output: Path,
    opset: Optional[int] = None,
    device: str = "cpu",
    input_shapes: Optional[Dict] = None,
    model_kwargs: Optional[Dict[str, Any]] = None,
    ov_config: Optional["OVConfig"] = None,
    stateful: bool = True,
    patch_16bit_model: bool = False,
    library_name: Optional[str] = None,
) -> Tuple[List[str], List[str], bool]:
    """
    Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation.

    PyTorch models (`nn.Module` instances) are dispatched to `export_pytorch`, TensorFlow models
    (`TFPreTrainedModel` instances) to `export_tensorflow`.

    Args:
        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
            The model to export.
        config ([`~exporters.onnx.config.OnnxConfig`]):
            The ONNX configuration associated with the exported model.
        output (`Path`):
            Path of the exported model file.
        opset (`Optional[int]`, defaults to `None`):
            The version of the ONNX operator set to use. For TensorFlow models, `config.DEFAULT_ONNX_OPSET` is
            used when it is `None`.
        device (`str`, *optional*, defaults to `cpu`):
            The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for
            export on CUDA devices.
        input_shapes (`Optional[Dict]`, defaults to `None`):
            If specified, allows to use specific shapes for the example input provided to the exporter.
            Ignored (with an info log) for TensorFlow models.
        model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`):
            Additional kwargs for model export. Only forwarded on the PyTorch path.
        ov_config (`OVConfig`, *optional*):
            The configuration containing the parameters related to quantization.
        stateful (`bool`, defaults to `True`):
            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
            When `True`, the flag is replaced by the result of `ensure_stateful_is_available()`.
        patch_16bit_model (`bool`, defaults to `False`):
            Forwarded to `export_pytorch`. Not used on the TensorFlow path.
        library_name (`Optional[str]`, defaults to `None`):
            The source library of the model, forwarded to the framework-specific exporter.

    Returns:
        `Tuple[List[str], List[str], bool]`: The input names, the output names, and a flag that is `True` when the
        model was converted through an intermediate ONNX export and `False` when it was converted directly.

    Raises:
        ImportError: If neither PyTorch nor TensorFlow is installed, or if a diffusers model is given while
            `diffusers` is not installed.
        RuntimeError: If a TensorFlow export on CUDA is requested, or if the model does not match any installed
            framework.
    """
    if not (is_torch_available() or is_tf_available()):
        raise ImportError(
            "Cannot convert because neither PyTorch nor TensorFlow are installed. "
            "Please install torch or tensorflow first."
        )

    if "diffusers" in str(model.__class__) and not is_diffusers_available():
        raise ImportError("The package `diffusers` is required to export diffusion models to OpenVINO.")

    if stateful:
        # This will be checked anyway after the model conversion, but checking it earlier will save time for a user if not suitable version is used
        stateful = ensure_stateful_is_available()

    # `nn` / `TFPreTrainedModel` only exist when the framework is installed, hence the availability guards.
    if is_torch_available() and isinstance(model, nn.Module):
        return export_pytorch(
            model,
            config,
            opset,
            output,
            device=device,
            input_shapes=input_shapes,
            ov_config=ov_config,
            model_kwargs=model_kwargs,
            stateful=stateful,
            patch_16bit_model=patch_16bit_model,
            library_name=library_name,
        )

    elif is_tf_available() and isinstance(model, TFPreTrainedModel):
        output.parent.mkdir(parents=True, exist_ok=True)
        if opset is None:
            opset = config.DEFAULT_ONNX_OPSET
        if device == "cuda":
            raise RuntimeError("`tf2onnx` does not support export on CUDA device.")
        if input_shapes is not None:
            logger.info("`input_shapes` argument is not supported by the Tensorflow ONNX export and will be ignored.")
        return export_tensorflow(model, config, opset, output, ov_config=ov_config, library_name=library_name)

    else:
        raise RuntimeError(
            "You either provided a PyTorch model with only TensorFlow installed, or a TensorFlow model with only PyTorch installed."
        )


def export_tensorflow(
    model: "TFPreTrainedModel",
    config: "OnnxConfig",
    opset: int,
    output: Path,
    ov_config: Optional["OVConfig"] = None,
    library_name: Optional[str] = None,
):
    """
    Export the TensorFlow model to OpenVINO format.

    The model is first exported to ONNX next to `output` (same path with an `.onnx` suffix), the ONNX file is
    then converted with `convert_model` and the result is saved at `output`.

    Args:
        model (`TFPreTrainedModel`): The model to export.
        config (`OnnxConfig`): The configuration of the model.
        opset (`int`): The ONNX opset version to use.
        output (`Path`): The path to save the model.
        ov_config (`OVConfig`, *optional*): Forwarded to `_save_model`.
        library_name (`str`, *optional*):
            The source library of the model. It is resolved through `_infer_library_from_model_or_model_class`.

    Returns:
        input_names: list of input names from ONNX configuration
        output_names: list of output names from ONNX configuration
        bool: always `True`, signalling that the intermediate ONNX export path was used.
    """
    from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx

    output = Path(output)
    onnx_path = output.with_suffix(".onnx")
    input_names, output_names = export_tensorflow_onnx(model, config, opset, onnx_path)
    ov_model = convert_model(str(onnx_path))

    library_name = _infer_library_from_model_or_model_class(model=model, library_name=library_name)

    # Save at `output` itself. Joining `output.parent / output` only yields `output` for absolute paths and
    # duplicates the directory part for relative ones (e.g. "a/b.xml" -> "a/a/b.xml").
    _save_model(
        ov_model,
        output,
        ov_config=ov_config,
        library_name=library_name,
        config=config,
    )
    del ov_model
    gc.collect()
    return input_names, output_names, True


def export_pytorch_via_onnx(
    model: Union["PreTrainedModel", "ModelMixin"],
    config: "OnnxConfig",
    opset: int,
    output: Path,
    device: str = "cpu",
    input_shapes: Optional[Dict] = None,
    model_kwargs: Optional[Dict[str, Any]] = None,
    ov_config: Optional["OVConfig"] = None,
    library_name: Optional[str] = None,
):
    """
    Exports a PyTorch model to an OpenVINO Intermediate Representation via ONNX export.

    Args:
        model ([`PreTrainedModel`]):
            The model to export.
        config ([`~exporters.onnx.config.OnnxConfig`]):
            The configuration associated with the exported model.
        opset (`int`):
            The version of the ONNX operator set to use.
        output (`Path`):
            Path of the exported model. The intermediate ONNX file is written next to it with an `.onnx` suffix.
            If the suffix of `output` is not `.xml`, the IR is saved as `OV_XML_FILE_NAME` in the parent directory.
        device (`str`, defaults to `"cpu"`):
            The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for
            export on CUDA devices.
        input_shapes (`optional[Dict]`, defaults to `None`):
            If specified, allows to use specific shapes for the example input provided to the exporter.
        model_kwargs (optional[Dict[str, Any]], defaults to `None`):
            Additional kwargs for model export.
        ov_config (`OVConfig`, *optional*):
            The configuration containing the parameters related to quantization.
        library_name (`str`, *optional*):
            The source library of the model. It is resolved through `_infer_library_from_model_or_model_class`.

    Returns:
        `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, the named outputs
        from the ONNX configuration, and a boolean flag that is always `True` here because the legacy ONNX path
        was applied to the model.
    """
    import torch

    from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx

    output = Path(output)
    model.config.torchscript = False
    model.config.return_dict = True
    onnx_output = output.with_suffix(".onnx")

    # Temporarily replace the global `torch.onnx.export` so that the optimum ONNX exporter runs with
    # `do_constant_folding=False`. The original function is restored even if the export raises, otherwise
    # the monkeypatch would leak into every later ONNX export in the process.
    orig_torch_onnx_export = torch.onnx.export
    torch.onnx.export = functools.partial(orig_torch_onnx_export, do_constant_folding=False)
    try:
        input_names, output_names = export_pytorch_to_onnx(
            model, config, opset, onnx_output, device, input_shapes, model_kwargs
        )
    finally:
        torch.onnx.export = orig_torch_onnx_export

    ov_model = convert_model(str(onnx_output))

    library_name = _infer_library_from_model_or_model_class(model=model, library_name=library_name)

    _save_model(
        ov_model,
        output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output,
        ov_config=ov_config,
        library_name=library_name,
        config=config,
    )
    del ov_model
    gc.collect()
    return input_names, output_names, True


def export_pytorch(
    model: Union["PreTrainedModel", "ModelMixin"],
    config: "OnnxConfig",
    opset: int,
    output: Path,
    device: str = "cpu",
    input_shapes: Optional[Dict] = None,
    model_kwargs: Optional[Dict[str, Any]] = None,
    ov_config: Optional["OVConfig"] = None,
    stateful: bool = False,
    patch_16bit_model: bool = False,
    library_name: Optional[str] = None,
) -> Tuple[List[str], List[str], bool]:
    """
    Exports a PyTorch model to an OpenVINO Intermediate Representation.

    The model is first converted directly with `convert_model`. If anything in that attempt raises, the
    function logs a warning and falls back to `export_pytorch_via_onnx`.

    Args:
        model ([`PreTrainedModel`]):
            The model to export.
        config ([`~exporters.onnx.config.OnnxConfig`]):
            The configuration associated with the exported model.
        opset (`int`):
            The version of the ONNX operator set to use. Only used by the ONNX fallback.
        output (`Path`):
            Path of the exported model.
        device (`str`, defaults to `"cpu"`):
            The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for
            export on CUDA devices.
        input_shapes (`optional[Dict]`, defaults to `None`):
            If specified, allows to use specific shapes for the example input provided to the exporter.
        model_kwargs (optional[Dict[str, Any]], defaults to `None`):
            Additional kwargs for model export
        ov_config (`OVConfig`, *optional*):
            The configuration containing the parameters related to quantization.
        stateful (`bool`, defaults to `False`):
            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
            Not honoured when the ONNX fallback is taken (a warning is logged instead).
        patch_16bit_model (`bool`, defaults to `False`):
            When `True`, `__make_16bit_traceable` from `openvino.frontend.pytorch.patch_model` is applied to the
            model before conversion. On fallback, the patch is removed and modules holding float16/bfloat16
            parameters or buffers are cast with `.float()`.
        library_name (`str`, *optional*):
            The source library of the model. It is resolved through `_infer_library_from_model_or_model_class`.

    Returns:
        `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, the named outputs
        from the ONNX configuration, and a boolean flag telling whether the legacy ONNX path was applied to the
        model (`False` for direct conversion, `True` when the fallback was used).
    """
    import torch
    from torch.utils._pytree import tree_map

    from optimum.exporters.utils import check_dummy_inputs_are_allowed

    logger.info(f"Using framework PyTorch: {torch.__version__}")
    output = Path(output)

    if stateful:
        # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
        # both of them are applied to demonstrate the best performance.
        # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation.
        model = patch_model_with_bettertransformer(model)
        # TODO: Consider unpatching model after export is done in the end of this function.
        #       Now it is left as-is because the model is not expected to be used after call export_pytorch, and
        #       this function is one of the _internal_ steps in a bigger model conversion pipeline.

    with torch.no_grad():
        if hasattr(model, "config"):
            model.config.torchscript = False
            model.config.return_dict = True
        model.eval()

        # Check if we need to override certain configuration item
        # NOTE(review): unlike the block above, this accesses `model.config` without a `hasattr` guard.
        if config.values_override is not None:
            logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
            for override_config_key, override_config_value in config.values_override.items():
                logger.info(f"\t- {override_config_key} -> {override_config_value}")
                setattr(model.config, override_config_key, override_config_value)

        if input_shapes is None:
            input_shapes = {}  # will use the defaults from DEFAULT_DUMMY_SHAPES

        # Check that inputs match, and order them properly
        dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes)
        device = torch.device(device)
        # The model and every tensor of the dummy inputs are moved only when CUDA is requested and available.
        if device.type == "cuda" and torch.cuda.is_available():
            model.to(device)
            dummy_inputs = tree_map(
                lambda value: value.to(device) if isinstance(value, torch.Tensor) else value, dummy_inputs
            )

        dummy_inputs = config.rename_ambiguous_inputs(dummy_inputs)
        # `dict_inputs` holds `(input_name, keys)` pairs that `ts_patched_forward` uses to rebuild dict inputs.
        dummy_inputs, dict_inputs = remove_none_from_dummy_inputs(dummy_inputs)

        try:
            # TorchScript used behind OpenVINO conversion. Optimum supports only return_dict=True models for patching,
            # while TorchScript do not support dictionary with values of mixed types (e.g. Tensor and None) in model input/output
            # To handle it, additional wrapper on patcher forward applied.
            # model.config.torchscript = True can not be used for patching, because it overrides return_dict to False
            patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs)
            patched_forward = patcher.patched_forward

            @functools.wraps(patched_forward)
            def ts_patched_forward(*args, **kwargs):
                # Rebuild dictionaries from the tuple inputs recorded in `dict_inputs` before calling the
                # patched forward.
                for i in range(len(dict_inputs)):
                    input_name, keys = dict_inputs[i]
                    tuple_input = kwargs[input_name]
                    input_dict = dict(zip(keys, tuple_input))
                    kwargs[input_name] = input_dict
                outputs = patched_forward(*args, **kwargs)
                # Return the output values as a tuple, turning list values into tuples.
                return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs.values()])

            patcher.patched_forward = ts_patched_forward

            with patcher:
                if patch_16bit_model:
                    from openvino.frontend.pytorch.patch_model import __make_16bit_traceable

                    __make_16bit_traceable(model)
                check_dummy_inputs_are_allowed(model, dummy_inputs)
                input_info = _get_input_info(model, config, dummy_inputs)
                ov_model = convert_model(
                    model,
                    example_input=dummy_inputs,
                    input=[(item.shape, item.type) for item in input_info],
                )

        except Exception as ex:
            # Any failure of the direct conversion attempt (patching, input checks or `convert_model`) triggers
            # the ONNX fallback below.
            logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX")

            if stateful:
                # cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly
                # TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation
                logger.warning(
                    "[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. "
                    "A stateless model will be exported instead. It may result in sub-optimal inference performance."
                    "Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
                )

            if patch_16bit_model:
                from openvino.frontend.pytorch.patch_model import unpatch_model

                # Undo the 16-bit tracing patch, then cast every module that directly owns a float16/bfloat16
                # parameter or buffer with `.float()` before handing the model to the ONNX exporter.
                unpatch_model(model, "_openvino_module_extension_patch_orig_forward")
                for m in model.modules():
                    if any(p.dtype in [torch.float16, torch.bfloat16] for p in m.parameters(False)) or any(
                        b.dtype in [torch.float16, torch.bfloat16] for b in m.buffers(False)
                    ):
                        m.float()

            return export_pytorch_via_onnx(
                model,
                config,
                opset,
                output,
                device,
                input_shapes,
                model_kwargs,
                ov_config=ov_config,
                library_name=library_name,
            )

        ov_model.validate_nodes_and_infer_types()  # TODO: remove as unnecessary validation?

        # Name the converted model's outputs after the export config, by position. Outputs beyond the
        # configured names keep whatever names the conversion gave them.
        output_names = list(config.outputs.keys())
        for idx, out_tensor in enumerate(ov_model.outputs):
            if idx < len(output_names):
                out_tensor.get_tensor().set_names({output_names[idx]})

        # Name the inputs after `input_info`, by position. This indexes `input_names` for every model input,
        # so it expects at least as many `input_info` entries as the converted model has inputs.
        input_names = [item.name for item in input_info]
        for idx, inp_tensor in enumerate(ov_model.inputs):
            input_name = input_names[idx]
            inp_tensor.get_tensor().set_names({input_name})

        if stateful:
            patch_stateful(model.config, ov_model)

        library_name = _infer_library_from_model_or_model_class(model=model, library_name=library_name)

        _save_model(
            ov_model,
            output,
            ov_config=ov_config,
            library_name=library_name,
            config=config,
        )
        clear_class_registry()
        # Drop local references before forcing a collection pass.
        del ov_model
        del model
        gc.collect()
    return input_names, output_names, False


def export_models(
    models_and_export_configs: Dict[
        str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"], "OnnxConfig"]
    ],
    output_dir: Path,
    opset: Optional[int] = None,
    output_names: Optional[List[str]] = None,
    device: str = "cpu",
    input_shapes: Optional[Dict] = None,
    model_kwargs: Optional[Dict[str, Any]] = None,
    ov_config: Optional["OVConfig"] = None,
    stateful: bool = True,
    patch_16bit_model: bool = False,
    library_name: Optional[str] = None,
) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Export every submodel of `models_and_export_configs` to OpenVINO IR format.

    Args:
        models_and_export_configs (Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"]):
            Mapping from submodel name to the `(model, export_config)` pair to export.
        output_dir (Path): output directory for saving models
        opset (Optional[int], optional, Default to None): ONNX export opset
        output_names (Optional[List[str]], optional, Defaults to None):
            File names (relative to `output_dir`) of the exported models, one per submodel and in the same order.
            When omitted, `<model_name>.xml` is used.
        device (str, optional, Defaults to "cpu"):
            The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for
            export on CUDA devices.
        input_shapes (Optional[Dict], optional, Defaults to None):
            If specified, allows to use specific shapes for the example input provided to the exporter.
        model_kwargs (Optional[Dict[str, Any]], optional):
            Additional kwargs for model export.
        ov_config (`OVConfig`, *optional*):
            The configuration containing the parameters related to quantization.
        stateful (`bool`, defaults to `True`)
            Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
            A list or tuple is also accepted, in which case the i-th entry applies to the i-th submodel.
        patch_16bit_model (`bool`, defaults to `False`): forwarded unchanged to `export`.
        library_name (Optional[str], optional): forwarded unchanged to `export`.

    Raises:
        ValueError: if the number of custom names differs from the number of models

    Returns:
        The per-model results of `export`, transposed: one list per result position (input names, output
        names, ...), each with one entry per exported model. Empty when there is nothing to export.
    """
    if output_names is not None and len(output_names) != len(models_and_export_configs):
        raise ValueError(
            f"Provided custom names {output_names} for the export of {len(models_and_export_configs)} models. Please provide the same number of names as models to export."
        )

    per_model_stateful = isinstance(stateful, (list, tuple))
    results = []

    for index, (model_name, (submodel, sub_export_config)) in enumerate(models_and_export_configs.items()):
        if output_names is None:
            file_name = Path(model_name + ".xml")
        else:
            file_name = output_names[index]

        target_path = output_dir / file_name
        target_path.parent.mkdir(parents=True, exist_ok=True)

        submodel_stateful = stateful[index] if per_model_stateful else stateful
        exported = export(
            model=submodel,
            config=sub_export_config,
            output=target_path,
            opset=opset,
            device=device,
            input_shapes=input_shapes,
            model_kwargs=model_kwargs,
            ov_config=ov_config,
            stateful=submodel_stateful,
            patch_16bit_model=patch_16bit_model,
            library_name=library_name,
        )
        results.append(exported)

    # Transpose [(inputs, outputs, flag), ...] into [[inputs, ...], [outputs, ...], [flag, ...]].
    return [list(column) for column in zip(*results)]


def export_from_model(
    model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"],
    output: Union[str, Path],
    task: Optional[str] = None,
    ov_config: Optional["OVConfig"] = None,
    stateful: bool = True,
    opset: Optional[int] = None,
    model_kwargs: Optional[Dict[str, Any]] = None,
    custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None,
    fn_get_submodels: Optional[Callable] = None,
    preprocessors: Optional[List] = None,
    device: str = "cpu",
    trust_remote_code: bool = False,
    patch_16bit_model: bool = False,
    **kwargs_shapes,
):
    """
    Export a loaded model, together with its configuration files, to OpenVINO IR files under `output`.

    The function resolves the task and the export configs, saves the model configuration, the generation
    config and the preprocessors (or the pipeline sub-components for diffusers), and finally exports every
    submodel with `export_models`.

    Args:
        model ([`PreTrainedModel`], [`TFPreTrainedModel`], [`ModelMixin`] or [`DiffusionPipeline`]):
            The model to export.
        output (`Union[str, Path]`):
            Output directory. It is created when it does not exist.
        task (`Optional[str]`, defaults to `None`):
            The export task. When `None` or `"auto"`, the task is inferred from the model, and the `-with-past`
            variant is selected when it is supported for the model type.
        ov_config (`OVConfig`, *optional*):
            The configuration containing the parameters related to quantization.
        stateful (`bool`, defaults to `True`):
            Request a stateful model. The flag is only kept when the task or the model type supports it.
        opset (`Optional[int]`, defaults to `None`):
            The version of the ONNX operator set to use.
        model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`):
            Additional kwargs for model export.
        custom_export_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`):
            Export configs for custom architectures. Required when the model type is not supported natively.
        fn_get_submodels (`Optional[Callable]`, defaults to `None`):
            Forwarded to the submodel discovery helpers.
        preprocessors (`Optional[List]`, defaults to `None`):
            Preprocessors to save next to the exported model.
        device (`str`, defaults to `"cpu"`):
            The device on which the model will be exported.
        trust_remote_code (`bool`, defaults to `False`):
            Forwarded to `save_preprocessors`.
        patch_16bit_model (`bool`, defaults to `False`):
            Forwarded to `export_models`.
        **kwargs_shapes:
            Overrides for the entries of `DEFAULT_DUMMY_SHAPES`. Keys that are not part of
            `DEFAULT_DUMMY_SHAPES` are ignored.

    Returns:
        `List[str]`: The paths of the exported `.xml` files, relative to `output`.

    Raises:
        ImportError: If a quantization config is given while nncf is not installed.
        RuntimeError: If the task cannot be inferred automatically.
        ValueError: If the architecture is custom and no `custom_export_configs` is given, if a text-generation
            task is requested for an encoder-decoder model, or if the model type is listed as unsupported.
    """
    model_kwargs = model_kwargs or {}

    if ov_config is not None and ov_config.quantization_config and not is_nncf_available():
        raise ImportError(
            f"Compression of the weights to {ov_config.quantization_config} requires nncf, please install it with `pip install nncf`"
        )

    library_name = _infer_library_from_model_or_model_class(model)
    if library_name != "open_clip":
        TasksManager.standardize_model_attributes(model)

    if hasattr(model.config, "export_model_type"):
        model_type = model.config.export_model_type.replace("_", "-")
    else:
        model_type = model.config.model_type.replace("_", "-")

    custom_architecture = library_name == "transformers" and model_type not in TasksManager._SUPPORTED_MODEL_TYPE

    if task is not None and task != "auto":
        task = TasksManager.map_from_synonym(task)
    else:
        try:
            task = TasksManager._infer_task_from_model_or_model_class(model=model)
        except (ValueError, KeyError) as e:
            # Chain the original error so that the root cause stays visible in the traceback.
            raise RuntimeError(
                f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
            ) from e

        if (
            not custom_architecture
            and library_name != "diffusers"
            and task + "-with-past"
            in TasksManager.get_supported_tasks_for_model_type(model_type, "openvino", library_name=library_name)
        ):
            # -with-past is the default.
            task = task + "-with-past"

        logger.info(f"Automatic task detection to: {task}.")

    stateful = stateful and (
        ensure_export_task_support_stateful(task)
        or ensure_model_type_support_stateful(getattr(getattr(model, "config", {}), "model_type", ""))
    )
    # TODO: support onnx_config.py in the model repo
    if custom_architecture and custom_export_configs is None:
        raise ValueError(
            f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum-intel/issues if you would like the model type {model_type} to be supported natively in the OpenVINO export."
        )

    if task.startswith("text-generation") and model.config.is_encoder_decoder:
        raise ValueError(
            f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. If the task was auto-inferred, please fill a bug report "
            f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model,"
            f" referring to `optimum.exporters.tasks.TaskManager`'s `_TRANSFORMERS_TASKS_TO_MODEL_LOADERS`."
        )
    if library_name != "diffusers" and model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE:
        raise ValueError(
            f"{model_type} is not supported yet. Only {list(TasksManager._SUPPORTED_CLI_MODEL_TYPE.keys())} are supported. "
            f"If you want to support {model_type} please propose a PR or open up an issue."
        )

    output = Path(output)
    if not output.exists():
        output.mkdir(parents=True)

    # Get the shapes to be used to generate dummy inputs
    input_shapes = {
        input_name: kwargs_shapes[input_name] if input_name in kwargs_shapes else default_shape
        for input_name, default_shape in DEFAULT_DUMMY_SHAPES.items()
    }

    if library_name == "open_clip":
        custom_architecture = True
        custom_export_configs, fn_get_submodels = _get_open_clip_submodels_fn_and_export_configs(
            model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels
        )

    if library_name == "diffusers":
        _, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino")
        stateful_submodels = False
    else:
        # INFO (and lower) records are silenced process-wide while the submodels are collected. The previous
        # state must be restored even if the helper raises, otherwise logging stays disabled for the caller.
        logging.disable(logging.INFO)
        try:
            _, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs(
                model=model,
                task=task,
                monolith=False,
                custom_export_configs=custom_export_configs if custom_export_configs is not None else {},
                custom_architecture=custom_architecture,
                fn_get_submodels=fn_get_submodels,
                preprocessors=preprocessors,
                library_name=library_name,
                model_kwargs=model_kwargs,
                _variant="default",
                legacy=False,
                exporter="openvino",
                stateful=stateful,
            )
        finally:
            logging.disable(logging.NOTSET)

    if library_name == "open_clip":
        if hasattr(model.config, "save_pretrained"):
            model.config.save_pretrained(output)

        # `preprocessors` defaults to `None`; treat that as "nothing to save".
        for preprocess in preprocessors or []:
            if hasattr(preprocess, "save_pretrained"):
                preprocess.save_pretrained(output)

        files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
    elif library_name != "diffusers":
        if is_transformers_version(">=", "4.44.99"):
            # some model configs may have issues with loading without parameters initialization
            try:
                misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
            except (KeyError, TypeError):
                misplaced_generation_parameters = {}
            if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
                logger.warning(
                    "Moving the following attributes in the config to the generation config: "
                    f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
                    "generation parameters in the model config, as opposed to in the generation config.",
                )
                for param_name, param_value in misplaced_generation_parameters.items():
                    setattr(model.generation_config, param_name, param_value)
                    setattr(model.config, param_name, None)

        # Saving the model config and preprocessor as this is needed sometimes.
        save_config(model.config, output)
        generation_config = getattr(model, "generation_config", None)
        if generation_config is not None:
            # Best effort: a generation config that cannot be saved must not abort the export.
            try:
                generation_config.save_pretrained(output)
            except Exception as exception:
                logger.warning(
                    f"The generation config will not be saved, saving failed with following error:\n{exception}"
                )

        save_preprocessors(preprocessors, model.config, output, trust_remote_code)

        files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]

    else:
        # save the subcomponent configuration
        for model_name in models_and_export_configs:
            subcomponent = models_and_export_configs[model_name][0]
            if hasattr(subcomponent, "save_config"):
                subcomponent.save_config(output / model_name)
            elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"):
                subcomponent.config.save_pretrained(output / model_name)

        files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_export_configs]

        # Saving the additional components needed to perform inference.
        model.scheduler.save_pretrained(output.joinpath("scheduler"))

        # Optional pipeline components: each one is saved in a sub-directory named after the attribute.
        for component_name in ("feature_extractor", "tokenizer", "tokenizer_2", "tokenizer_3", "safety_checker"):
            component = getattr(model, component_name, None)
            if component is not None:
                component.save_pretrained(output.joinpath(component_name))

        model.save_config(output)

    _set_runtime_options(
        models_and_export_configs,
        task,
        library_name,
        # Only the truthiness of the quantization config matters to `_set_runtime_options`.
        bool(getattr(ov_config, "quantization_config", None)),
    )

    export_models(
        models_and_export_configs=models_and_export_configs,
        output_dir=output,
        output_names=files_subpaths,
        input_shapes=input_shapes,
        device=device,
        ov_config=ov_config,
        stateful=stateful_submodels,
        opset=opset,
        model_kwargs=model_kwargs,
        patch_16bit_model=patch_16bit_model,
        library_name=library_name,
    )

    return files_subpaths


def export_tokenizer(
    tokenizer,
    output: Union[str, Path],
    suffix: Optional[str] = "",
    task: Optional[str] = None,
):
    """
    Convert `tokenizer` to OpenVINO tokenizer (and, when supported, detokenizer) models and save them in `output`.

    The export is best effort: nothing is saved, and no exception is raised, when `openvino_tokenizers` is not
    installed or when the conversion fails.

    Args:
        tokenizer:
            Tokenizer handed to `openvino_tokenizers.convert_tokenizer`. For tasks starting with
            `text-generation` and openvino-tokenizers >= 2024.3.0.0, its `padding_side` and `truncation_side`
            attributes are set to `"left"` in place.
        output (`Union[str, Path]`):
            Directory in which the converted models are saved.
        suffix (`Optional[str]`, defaults to `""`):
            Value formatted into `OV_TOKENIZER_NAME` / `OV_DETOKENIZER_NAME` to build the file names.
            `None` is treated as an empty suffix.
        task (`Optional[str]`, defaults to `None`):
            Export task, only used to decide whether the tokenizer is switched to left padding/truncation.
    """
    # avoid circular imports
    from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME
    from optimum.intel.openvino.utils import maybe_convert_tokenizer_to_fast

    try:
        from openvino_tokenizers import convert_tokenizer
    except ModuleNotFoundError:
        return

    if is_tokenizers_version(">", "0.19") and is_openvino_tokenizers_version("<", "2024.5.0.0"):
        # Only a warning: the conversion is still attempted below.
        logger.warning(
            "Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19 and openvino version <= 2024.4. "
            "Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO."
        )

    if not isinstance(output, Path):
        output = Path(output)

    # Without this, a `None` suffix would end up as the literal text "None" in the file names.
    if suffix is None:
        suffix = ""

    if output.exists():
        tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output)

    if (
        task is not None
        and task.startswith("text-generation")
        and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0")
    ):
        logger.info(f"Set tokenizer padding side to left for `{task}` task.")
        tokenizer.padding_side = "left"
        tokenizer.truncation_side = "left"

    try:
        try:
            converted = convert_tokenizer(tokenizer, with_detokenizer=True)
        except NotImplementedError:
            logger.info("Detokenizer is not supported, convert tokenizer only.")
            # The retry is nested inside the outer `try` on purpose: a failure of the tokenizer-only
            # conversion must be handled like any other conversion failure instead of escaping.
            converted = convert_tokenizer(tokenizer, with_detokenizer=False)
    except OVTypeError:
        logger.debug(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
        return
    except Exception as exception:
        logger.debug(
            f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported. Exception: {exception}"
        )
        return

    # A tokenizer-only conversion returns a single model rather than a (tokenizer, detokenizer) pair.
    if not isinstance(converted, tuple):
        converted = (converted,)

    # `zip` stops at the shorter sequence, so no detokenizer file is written when only one model exists.
    for model, file_name in zip(converted, (OV_TOKENIZER_NAME, OV_DETOKENIZER_NAME)):
        save_model(model, output / file_name.format(suffix))


def _add_runtime_options_to_rt_info(model: Model, options: Dict):
    """
    Add runtime optinos
    """
    try:
        for name, value in options.items():
            model.set_rt_info(value, ["runtime_options", name])
    except Exception:
        pass

    return model


def _add_version_info_to_model(model: Model, library_name: Optional[str] = None):
    """
    Record the versions of the libraries involved in the export in the `optimum` section of the model's rt_info.

    The whole routine is best effort: the first exception raised stops the remaining updates and is swallowed,
    and `model` is returned in every case.
    """
    try:
        always_recorded = (
            (_transformers_version, "transformers_version"),
            (_torch_version, "pytorch_version"),
            (_optimum_intel_version, "optimum_intel_version"),
            (_optimum_version, "optimum_version"),
        )
        for version, key in always_recorded:
            model.set_rt_info(version, ["optimum", key])

        # An output named `token_embeddings` triggers recording of the sentence_transformers version.
        exposes_token_embeddings = any("token_embeddings" in port.get_names() for port in model.outputs)
        if exposes_token_embeddings:
            import sentence_transformers

            model.set_rt_info(sentence_transformers.__version__, ["optimum", "sentence_transformers_version"])

        # At most one library-specific version is recorded, selected by `library_name`.
        per_library = (
            ("diffusers", _diffusers_version, "diffusers_version"),
            ("timm", _timm_version, "timm_version"),
            ("open_clip", _open_clip_version, "open_clip_version"),
        )
        for candidate, version, key in per_library:
            if library_name == candidate:
                model.set_rt_info(version, ["optimum", key])
                break

        rt_info = model.get_rt_info()
        if "nncf" in rt_info:
            model.set_rt_info(_nncf_version, ["optimum", "nncf_version"])

        # Record the onnx version when the conversion parameters mention an onnx input model.
        input_model = rt_info["conversion_parameters"].get("input_model", None)
        if input_model is not None and "onnx" in input_model.value:
            import onnx

            model.set_rt_info(onnx.__version__, ["optimum", "onnx_version"])

    except Exception:
        pass

    return model


def _get_multi_modal_submodels_and_export_configs(
    model: Union["PreTrainedModel", "TFPreTrainedModel"],
    task: str,
    library_name: str,
    int_dtype: str,
    float_dtype: str,
    preprocessors: Optional[List[Any]] = None,
    model_kwargs: Optional[Dict] = None,
    stateful: bool = True,
):
    """
    Split a multi-modal model into the sub-models listed by its export config, one per supported behavior.

    Before the export config is built, a few values are copied onto `model.config`: the `<IMG_CONTEXT>` token id
    for `internvl-chat` (when preprocessors are given), `glb_GN` / `sub_GN` for `phi3-v`, and `image_newline`
    for any model exposing that attribute.

    Returns:
        A `(main_config, models_for_export, stateful_parts)` tuple. `models_for_export` maps
        `"{behavior}_model"` to `(model_part, model_part_config)`. `stateful_parts` holds one entry per
        sub-model, in the same order: `stateful` when the part config has a truthy `use_past`, else `False`.
    """
    model_type = model.config.model_type.replace("_", "-")

    if model_type == "internvl-chat" and preprocessors is not None:
        model.config.img_context_token_id = preprocessors[0].convert_tokens_to_ids("<IMG_CONTEXT>")

    if model_type == "phi3-v":
        vision_embed_tokens = model.model.vision_embed_tokens
        model.config.glb_GN = vision_embed_tokens.glb_GN.tolist()
        model.config.sub_GN = vision_embed_tokens.sub_GN.tolist()

    if hasattr(model, "image_newline"):
        model.config.image_newline = model.image_newline.tolist()

    config_constructor = TasksManager.get_exporter_config_constructor(
        model=model, task=task, exporter="openvino", library_name=library_name
    )
    main_config = config_constructor(
        model.config, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors
    )

    submodels = {}
    stateful_flags = []
    for behavior in main_config.SUPPORTED_BEHAVIORS:
        part_config = main_config.with_behavior(behavior)
        part = main_config.get_model_for_behavior(model, behavior)
        submodels[f"{behavior}_model"] = (part, part_config)

        uses_past = getattr(part_config, "use_past", False)
        stateful_flags.append(stateful if uses_past else False)

    return main_config, submodels, stateful_flags


def _get_submodels_and_export_configs(
    model: Union["PreTrainedModel", "TFPreTrainedModel", "DiffusionPipeline"],
    task: str,
    monolith: bool,
    custom_export_configs: Dict,
    custom_architecture: bool,
    _variant: str,
    library_name: str,
    int_dtype: str = "int64",
    float_dtype: str = "fp32",
    fn_get_submodels: Optional[Callable] = None,
    preprocessors: Optional[List[Any]] = None,
    legacy: bool = False,
    model_kwargs: Optional[Dict] = None,
    exporter: str = "openvino",
    stateful: bool = False,
):
    """
    Resolve the sub-models to export and their export configs, plus one stateful flag per sub-model.

    Non-custom `transformers` models whose model type is listed in `MULTI_MODAL_TEXT_GENERATION_MODELS` are
    handled by `_get_multi_modal_submodels_and_export_configs`. Every other model goes through the default
    optimum helper, and `stateful` is then repeated for each returned sub-model.

    Returns:
        A `(export_config, models_for_export, stateful_per_model)` tuple.
    """
    # The checks short-circuit, so `model.config` is only read for non-custom `transformers` models.
    is_multi_modal = (
        not custom_architecture
        and library_name == "transformers"
        and model.config.model_type.replace("_", "-") in MULTI_MODAL_TEXT_GENERATION_MODELS
    )

    if not is_multi_modal:
        export_config, models_for_export = _default_get_submodels_and_export_configs(
            model,
            task,
            monolith,
            custom_export_configs,
            custom_architecture,
            _variant,
            library_name,
            int_dtype,
            float_dtype,
            fn_get_submodels,
            preprocessors,
            legacy,
            model_kwargs,
            exporter,
        )
        return export_config, models_for_export, [stateful] * len(models_for_export)

    return _get_multi_modal_submodels_and_export_configs(
        model, task, library_name, int_dtype, float_dtype, preprocessors, model_kwargs, stateful
    )


def get_diffusion_models_for_export_ext(
    pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino"
):
    """
    Pick the sub-model collection routine matching the pipeline family and return `(None, models_for_export)`.

    Stable Diffusion 3 and Flux pipelines get their dedicated routines. Every other pipeline is delegated to
    `get_diffusion_models_for_export`. Pipeline classes are imported lazily, and only when the installed
    diffusers version is recent enough according to the version checks below.
    """
    is_sd3 = False
    if is_diffusers_version(">=", "0.29.0"):
        from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline

        sd3_classes = (StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline)
        if is_diffusers_version(">=", "0.30.0"):
            from diffusers import StableDiffusion3InpaintPipeline

            sd3_classes += (StableDiffusion3InpaintPipeline,)

        is_sd3 = isinstance(pipeline, sd3_classes)

    is_flux = False
    if is_diffusers_version(">=", "0.30.0"):
        from diffusers import FluxPipeline

        flux_classes = (FluxPipeline,)

        if is_diffusers_version(">=", "0.31.0"):
            from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline

            flux_classes += (FluxImg2ImgPipeline, FluxInpaintPipeline)

        if is_diffusers_version(">=", "0.32.0"):
            from diffusers import FluxFillPipeline

            flux_classes += (FluxFillPipeline,)

        is_flux = isinstance(pipeline, flux_classes)

    # SD3 takes precedence over Flux; anything else uses the generic diffusers routine.
    if is_sd3:
        models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
    elif is_flux:
        models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype)
    else:
        models_for_export = get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)

    return None, models_for_export


def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype):
    """
    Collect the components of a Stable Diffusion 3 pipeline to export, each paired with its export config.

    Returns:
        A dict mapping component name to `(model, export_config)`. Keys are inserted in this order:
        `text_encoder` (if present), `transformer`, `vae_encoder`, `vae_decoder`, `text_encoder_2` (if present),
        `text_encoder_3` (if present).
    """

    def _export_config_for(component, task, model_type):
        # Look up the config constructor for `component` and instantiate it from the component's own config.
        constructor = TasksManager.get_exporter_config_constructor(
            model=component,
            exporter=exporter,
            library_name="diffusers",
            task=task,
            model_type=model_type,
        )
        return constructor(component.config, int_dtype=int_dtype, float_dtype=float_dtype)

    submodels = {}

    # First text encoder, configured to output hidden states.
    first_encoder = getattr(pipeline, "text_encoder", None)
    if first_encoder is not None:
        first_encoder.config.output_hidden_states = True
        first_encoder.text_model.config.output_hidden_states = True
        first_encoder_config = _export_config_for(first_encoder, "feature-extraction", "clip-text-with-projection")
        submodels["text_encoder"] = (first_encoder, first_encoder_config)

    # Transformer: these config fields are set before the export config is built from it.
    transformer = pipeline.transformer
    transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim
    transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False)
    transformer.config.time_cond_proj_dim = None
    submodels["transformer"] = (
        transformer,
        _export_config_for(transformer, "semantic-segmentation", "sd3-transformer"),
    )

    # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
    # Works on a deep copy of the VAE whose `forward` returns the latent distribution parameters.
    encoder_vae = copy.deepcopy(pipeline.vae)
    encoder_vae.forward = lambda sample: {"latent_parameters": encoder_vae.encode(x=sample)["latent_dist"].parameters}
    submodels["vae_encoder"] = (
        encoder_vae,
        _export_config_for(encoder_vae, "semantic-segmentation", "vae-encoder"),
    )

    # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600
    # Works on a second deep copy of the VAE whose `forward` decodes a latent sample.
    decoder_vae = copy.deepcopy(pipeline.vae)
    decoder_vae.forward = lambda latent_sample: decoder_vae.decode(z=latent_sample)
    submodels["vae_decoder"] = (
        decoder_vae,
        _export_config_for(decoder_vae, "semantic-segmentation", "vae-decoder"),
    )

    # Optional second text encoder, configured like the first one.
    second_encoder = getattr(pipeline, "text_encoder_2", None)
    if second_encoder is not None:
        second_encoder.config.output_hidden_states = True
        second_encoder.text_model.config.output_hidden_states = True
        second_encoder_config = _export_config_for(second_encoder, "feature-extraction", "clip-text-with-projection")
        submodels["text_encoder_2"] = (second_encoder, second_encoder_config)

    # Optional third text encoder, exported with the `t5-encoder-model` config.
    third_encoder = getattr(pipeline, "text_encoder_3", None)
    if third_encoder is not None:
        third_encoder_config = _export_config_for(third_encoder, "feature-extraction", "t5-encoder-model")
        submodels["text_encoder_3"] = (third_encoder, third_encoder_config)

    return submodels


def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype):
    """
    Collect the components of a Flux pipeline to export, each paired with its export config.

    Returns:
        A dict mapping component name to `(model, export_config)`. Keys are inserted in this order:
        `text_encoder` (if present), `transformer`, `vae_encoder`, `vae_decoder`, `text_encoder_2` (if present).
    """

    def _export_config_for(component, task, model_type):
        # Look up the config constructor for `component` and instantiate it from the component's own config.
        constructor = TasksManager.get_exporter_config_constructor(
            model=component,
            exporter=exporter,
            library_name="diffusers",
            task=task,
            model_type=model_type,
        )
        return constructor(component.config, int_dtype=int_dtype, float_dtype=float_dtype)

    submodels = {}

    # First text encoder, exported with the `clip-text-model` config.
    first_encoder = getattr(pipeline, "text_encoder", None)
    if first_encoder is not None:
        first_encoder_config = _export_config_for(first_encoder, "feature-extraction", "clip-text-model")
        submodels["text_encoder"] = (first_encoder, first_encoder_config)

    # Transformer: these config fields are set before the export config is built from it.
    transformer = pipeline.transformer
    transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim
    transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False)
    transformer.config.time_cond_proj_dim = None
    submodels["transformer"] = (
        transformer,
        _export_config_for(transformer, "semantic-segmentation", "flux-transformer"),
    )

    # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
    # Works on a deep copy of the VAE whose `forward` returns the latent distribution parameters.
    encoder_vae = copy.deepcopy(pipeline.vae)
    encoder_vae.forward = lambda sample: {"latent_parameters": encoder_vae.encode(x=sample)["latent_dist"].parameters}
    submodels["vae_encoder"] = (
        encoder_vae,
        _export_config_for(encoder_vae, "semantic-segmentation", "vae-encoder"),
    )

    # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600
    # Works on a second deep copy of the VAE whose `forward` decodes a latent sample.
    decoder_vae = copy.deepcopy(pipeline.vae)
    decoder_vae.forward = lambda latent_sample: decoder_vae.decode(z=latent_sample)
    submodels["vae_decoder"] = (
        decoder_vae,
        _export_config_for(decoder_vae, "semantic-segmentation", "vae-decoder"),
    )

    # Optional second text encoder, exported with the `t5-encoder-model` config.
    second_encoder = getattr(pipeline, "text_encoder_2", None)
    if second_encoder is not None:
        second_encoder_config = _export_config_for(second_encoder, "feature-extraction", "t5-encoder-model")
        submodels["text_encoder_2"] = (second_encoder, second_encoder_config)

    return submodels