From b11becb7e8c757dd1cb860f34344976d0d34ac31 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 22 Mar 2024 10:30:18 +0400 Subject: [PATCH 1/5] support more models in export --- optimum/exporters/openvino/convert.py | 2 +- optimum/exporters/openvino/model_configs.py | 71 ++++++ optimum/exporters/openvino/model_patcher.py | 264 +++++++++++++++++++- 3 files changed, 334 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 98dd22d824..ccc046ce55 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -345,7 +345,7 @@ def ts_patched_forward(*args, **kwargs): input_dict = dict(zip(keys, tuple_input)) kwargs[input_name] = input_dict outputs = patched_forward(*args, **kwargs) - return tuple(outputs.values()) + return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs.values()]) patcher.patched_forward = ts_patched_forward diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a274b3671d..ddb6223951 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -35,6 +35,7 @@ ChatGLMModelPatcher, GemmaModelPatcher, MixtralModelPatcher, + OLMoModelPatcher, QwenModelPatcher, ) @@ -400,3 +401,73 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("olmo", *["text-generation", "text-generation-with-past"], library_name="transformers") +class OLMoOpenVINOConfig(TextDecoderOnnxConfig): + # OLMo does not require position_ids input. 
+ DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return OLMoModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") +class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.num_key_value_heads_per_layer = normalized_config.num_key_value_heads_per_layer + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_values = [] + + for layer_id in range(self.num_layers): + shape = ( + self.batch_size, + self.num_key_value_heads_per_layer[layer_id], + self.sequence_length, + self.hidden_size // self.num_attention_heads, + ) + past_key_values.append( + ( + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + ) + return past_key_values + + +@register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") +class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 2cedf64b0a..9a78a23e9d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -13,8 +13,9 @@ # limitations under the License. import logging as log +import math import types -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -513,5 +514,264 @@ def __init__( ): super().__init__(config, model, model_kwargs) # model has first inference buffers initialization - if self._model.lm_head.first_flag: + if hasattr(self._model.lm_head, "first_flag"): self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) + + +class OlmoOutput(NamedTuple): + logits: torch.FloatTensor + """ + A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities + for the next token *before* normalization via (log) softmax. 
+ """ + + attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] + """ + Attention keys and values from each block. + """ + + hidden_states: Optional[Tuple[torch.Tensor]] + """ + Hidden states from each block. + """ + + +def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False): + """ + Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf`` + is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``. + """ + if check_neg_inf: + x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min) + if check_pos_inf: + x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max) + + +def _olmo_model_forward( + self, + input_ids: torch.LongTensor, + input_embeddings: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + attention_bias: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]] = None, + use_cache: bool = False, + last_logits_only: bool = False, + output_hidden_states: Optional[bool] = None, +): + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + + if past_key_values: + assert len(past_key_values) == self.config.n_layers + + batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2] + if past_key_values is None: + past_length = 0 + else: + past_length = past_key_values[0][0].size(-2) + + # Get embeddings of input. + # shape: (batch_size, seq_len, d_model) + x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore + + if not (self.config.alibi or self.config.rope): + # Get positional embeddings. + # shape: (1, seq_len) + pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0) + # shape: (1, seq_len, d_model) + pos_emb = self.transformer.wpe(pos) # type: ignore + x = pos_emb + x + + # Add input + positional embeddings and apply dropout. + # shape: (batch_size, seq_len, d_model) + x = self.transformer.emb_drop(x) # type: ignore + + # Transform the attention mask into what the blocks expect. + if attention_mask is not None: + # shape: (batch_size, 1, 1, seq_len) + attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :] + attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min + + # Merge attention mask with attention bias. + if attention_bias is not None or attention_mask is not None or self.config.alibi or past_key_values is not None: + if attention_bias is None and self.config.alibi: + attention_bias = self.get_causal_attention_bias( + past_length + seq_len, x.device + ) + self.get_alibi_attention_bias(past_length + seq_len, x.device) + elif attention_bias is None: + attention_bias = self.get_causal_attention_bias(past_length + seq_len, x.device) + elif attention_bias.dtype in (torch.int8, torch.bool): + attention_bias = attention_bias.to(dtype=torch.float) + attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min) + + # Transform to the right shape and data type. + mask_len = seq_len + if attention_mask is not None: + mask_len = attention_mask.shape[-1] + elif past_key_values is not None: + mask_len = past_key_values[0][0].shape[-2] + seq_len + attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float) + + # Add in the masking bias. 
+ if attention_mask is not None: + attention_bias = attention_bias + attention_mask + # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf. + # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead + # it can produce NaNs. + ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False) + + attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None + + # decoder layers + all_hidden_states = [] + + # Apply blocks one-by-one. + if self.config.block_group_size == 1: + for block_idx, block in enumerate(self.transformer.blocks): + if output_hidden_states: + # add hidden states + all_hidden_states.append(x) + + layer_past = None if past_key_values is None else past_key_values[block_idx] + # shape: (batch_size, seq_len, d_model) + x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache) + if attn_key_values is not None: + assert cache is not None + attn_key_values.append(cache) + else: + for group_idx, block_group in enumerate(self.transformer.block_groups): + if output_hidden_states: + # add hidden states + all_hidden_states.append(x) + + layers_past = ( + None + if past_key_values is None + else past_key_values[ + group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size + ] + ) + x, cache = block_group(x, attention_bias=attention_bias, layers_past=layers_past, use_cache=use_cache) + if attn_key_values is not None: + assert cache is not None + attn_key_values.extend(cache) + + if last_logits_only: + # shape: (batch_size, 1, d_model) + x = x[:, -1, :].unsqueeze(1) + + # Apply final layer norm. + # shape: (batch_size, seq_len or 1, d_model) + x = self.transformer.ln_f(x) # type: ignore + if output_hidden_states: + # add final hidden state post-final-layernorm, following HuggingFace's convention + all_hidden_states.append(x) + + # Get logits. + # shape: (batch_size, seq_len or 1, vocab_size) + if self.config.weight_tying: + logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore + else: + logits = self.transformer.ff_out(x) # type: ignore + if self.config.scale_logits: + logits.mul_(1 / math.sqrt(self.config.d_model)) + + return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] + + +def _olmo_causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor: + att_bias = torch.triu( + torch.ones(seq_len, seq_len, device=device, dtype=torch.float), + diagonal=1, + ) + att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min) + return att_bias.view(1, 1, seq_len, seq_len) # type: ignore + + +def _olmo_get_causal_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: + if hasattr(self, "causal_bias") and self.causal_bias.shape[-1] >= seq_len: + return self.causal_bias.to(device) + with torch.autocast(device.type, enabled=False): + causal_bias = _olmo_causal_attention_bias(seq_len, device) + self.register_buffer("causal_bias", causal_bias) + return causal_bias + + +def _olmo_alibi_attention_bias(seq_len: int, config, device: torch.device) -> torch.FloatTensor: + alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, 1, seq_len) + """ + A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities + for the next token *before* normalization via (log) softmax. 
+ """ + # shape: (1, 1, seq_len, seq_len) + alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, seq_len, 1) + alibi_bias.abs_().mul_(-1) + + # shape: (n_heads,) + m = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device) + m.mul_(config.alibi_bias_max / config.n_heads) + + # shape: (1, n_heads, seq_len, seq_len) + return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore + + +def _olmo_get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: + alibi_bias = getattr(self, "alibi_attention_bias", None) + if alibi_bias is not None and alibi_bias.shape[-1] >= seq_len: + if alibi_bias.device != device: + alibi_bias = alibi_bias.to(device) + return alibi_bias + with torch.autocast(device.type, enabled=False): + alibi_bias = _olmo_alibi_attention_bias(seq_len, self.config, device) + self.register_buffer("alibi_attention_bias", alibi_bias) + return alibi_bias + + +def _olmo_get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]: + if ( + hasattr(self, "rope_pos_sin") + and hasattr(self, "rope_pos_cos") + and self.rope_pos_sin.shape[-2] >= seq_len + and self.rope_pos_cos.shape[-2] >= seq_len + ): + return self.rope_pos_sin.to(device)[:, :, :seq_len, :], self.rope_pos_sin.to(device)[:, :, :seq_len, :] + + with torch.autocast(device.type, enabled=False): + dim = self.config.d_model // self.config.n_heads + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim)) + seq = torch.arange(seq_len, device=device, dtype=torch.float) + freqs = torch.einsum("i , j -> i j", seq, inv_freq) + positions = torch.cat((freqs, freqs), dim=-1) + pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :] + + self.register_buffer("rope_pos_sin", pos_sin) + self.register_buffer("rope_pos_cos", pos_cos) + return pos_sin, pos_cos + + +class OLMoModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + # model uses custom cache buffers for storing rotary_embeddings and attention biases. 
+ # these objects are nontracable, replace them with standard torch tensors during export + self._model.model._orig_forward = self._model.model.forward + self._model.model._orig_get_alibi_attention_bias = self._model.model.get_alibi_attention_bias + self._model.model.forward = types.MethodType(_olmo_model_forward, self._model.model) + self._model.model.get_alibi_attention_bias = types.MethodType( + _olmo_get_alibi_attention_bias, self._model.model + ) + self._model.model.get_alibi_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) + self._model.model.get_causal_attention_bias = types.MethodType( + _olmo_get_causal_attention_bias, self._model.model + ) + self._model.model.get_causal_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) + for block in self._model.model.transformer.blocks: + block.rotary_emb._orig_get_rotary_embedding = block.rotary_emb.get_rotary_embedding + block.rotary_emb.get_rotary_embedding = types.MethodType(_olmo_get_rotary_embedding, block.rotary_emb) + block.rotary_emb.get_rotary_embedding(self._model.config.max_sequence_length, torch.device("cpu")) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.model.forward = self._model.model._orig_forward + self._model.model.get_alibi_attention_bias = self._model.model._orig_get_alibi_attention_bias + for block in self._model.model.transformer.blocks: + block.rotary_emb.get_rotary_embedding = block.rotary_emb._orig_get_rotary_embedding From 1e6f0b2833e6495dcbb491bdcd5a2d1a9ce870c3 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 25 Mar 2024 18:21:52 +0400 Subject: [PATCH 2/5] add orion --- optimum/exporters/openvino/__main__.py | 2 ++ optimum/exporters/openvino/model_configs.py | 11 ++++++++++- tests/openvino/test_modeling.py | 3 ++- tests/openvino/utils_tests.py | 2 ++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 5d6e31ebac..2f0bc1350f 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -203,6 +203,8 @@ def main_export( do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" model_type = config.model_type.replace("_", "-") + if model_type in {"falcon", "mpt"} and trust_remote_code: + trust_remote_code = False if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ddb6223951..d7b23d1238 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -75,7 +75,7 @@ def init_model_configs(): @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") -class BaichaunOpenVINOConfig(TextDecoderOnnxConfig): +class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" @@ -471,3 +471,12 @@ class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = 
NormalizedTextConfig + + +@register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") +class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f54305113f..70c6ceab5d 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -524,10 +524,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "stablelm", "starcoder2", "phi", + "internlm2", ) GENERATION_LENGTH = 100 IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") - REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "decilm") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c95444274e..bb11ad7baa 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -36,6 +36,7 @@ "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", "deberta": "hf-internal-testing/tiny-random-deberta", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", + "decilm": "katuni4ka/tiny-random-deciml", "deit": "hf-internal-testing/tiny-random-deit", "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", @@ -50,6 +51,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", + "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", "llama": "fxmarty/tiny-llama-fast-tokenizer", From 22141ef980d7f46163000f3d1898e82e02783f06 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 5 Apr 2024 14:38:34 +0400 Subject: [PATCH 3/5] update tests --- optimum/exporters/openvino/__main__.py | 17 +- optimum/exporters/openvino/model_configs.py | 62 ----- optimum/exporters/openvino/model_patcher.py | 262 +------------------- tests/openvino/test_modeling.py | 3 +- tests/openvino/utils_tests.py | 3 +- 5 files changed, 19 insertions(+), 328 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 2f0bc1350f..856ea6798f 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -202,9 +202,6 @@ def main_export( quantization_config = getattr(config, "quantization_config", None) do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" model_type = config.model_type.replace("_", "-") - - if model_type in {"falcon", "mpt"} and trust_remote_code: - trust_remote_code = False if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( @@ -222,6 +219,20 @@ def main_export( ) if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: loading_kwargs["attn_implementation"] = "eager" + # there are some difference between remote and in library representation of past key values for some 
models, + # for avoiding confusion we disable remote code for them + if ( + trust_remote_code + and model_type in {"falcon", "mpt", "phi"} + and ("with-past" in task or original_task == "auto") + and not custom_export_configs + ): + logger.warning( + "Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" + "using default export configuration, `trust_remote_code` will be disabled. " + "Please provide custon export config if you want load model with remote code." + ) + trust_remote_code = False # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index d7b23d1238..6f22cf2142 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -35,7 +35,6 @@ ChatGLMModelPatcher, GemmaModelPatcher, MixtralModelPatcher, - OLMoModelPatcher, QwenModelPatcher, ) @@ -403,18 +402,6 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig -@register_in_tasks_manager("olmo", *["text-generation", "text-generation-with-past"], library_name="transformers") -class OLMoOpenVINOConfig(TextDecoderOnnxConfig): - # OLMo does not require position_ids input. - DEFAULT_ONNX_OPSET = 13 - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig - - def patch_model_for_export( - self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None - ) -> "ModelPatcher": - return OLMoModelPatcher(self, model, model_kwargs=model_kwargs) - - @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 @@ -424,55 +411,6 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig -class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): - def __init__( - self, - task: str, - normalized_config: NormalizedTextConfig, - batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], - sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], - random_batch_size_range: Optional[Tuple[int, int]] = None, - random_sequence_length_range: Optional[Tuple[int, int]] = None, - **kwargs, - ): - super().__init__( - task=task, - normalized_config=normalized_config, - batch_size=batch_size, - sequence_length=sequence_length, - random_batch_size_range=random_batch_size_range, - random_sequence_length_range=random_sequence_length_range, - ) - self.num_key_value_heads_per_layer = normalized_config.num_key_value_heads_per_layer - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - past_key_values = [] - - for layer_id in range(self.num_layers): - shape = ( - self.batch_size, - self.num_key_value_heads_per_layer[layer_id], - self.sequence_length, - self.hidden_size // self.num_attention_heads, - ) - past_key_values.append( - ( - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - ) - ) - return past_key_values - - -@register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") -class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 14 - - DUMMY_INPUT_GENERATOR_CLASSES = 
(DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) - DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig - - @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 9a78a23e9d..bafd467dd4 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -13,9 +13,8 @@ # limitations under the License. import logging as log -import math import types -from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -516,262 +515,3 @@ def __init__( # model has first inference buffers initialization if hasattr(self._model.lm_head, "first_flag"): self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) - - -class OlmoOutput(NamedTuple): - logits: torch.FloatTensor - """ - A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities - for the next token *before* normalization via (log) softmax. - """ - - attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] - """ - Attention keys and values from each block. - """ - - hidden_states: Optional[Tuple[torch.Tensor]] - """ - Hidden states from each block. - """ - - -def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False): - """ - Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf`` - is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``. - """ - if check_neg_inf: - x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min) - if check_pos_inf: - x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max) - - -def _olmo_model_forward( - self, - input_ids: torch.LongTensor, - input_embeddings: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - attention_bias: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]] = None, - use_cache: bool = False, - last_logits_only: bool = False, - output_hidden_states: Optional[bool] = None, -): - output_hidden_states = output_hidden_states if output_hidden_states is not None else False - - if past_key_values: - assert len(past_key_values) == self.config.n_layers - - batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2] - if past_key_values is None: - past_length = 0 - else: - past_length = past_key_values[0][0].size(-2) - - # Get embeddings of input. - # shape: (batch_size, seq_len, d_model) - x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore - - if not (self.config.alibi or self.config.rope): - # Get positional embeddings. - # shape: (1, seq_len) - pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0) - # shape: (1, seq_len, d_model) - pos_emb = self.transformer.wpe(pos) # type: ignore - x = pos_emb + x - - # Add input + positional embeddings and apply dropout. 
- # shape: (batch_size, seq_len, d_model) - x = self.transformer.emb_drop(x) # type: ignore - - # Transform the attention mask into what the blocks expect. - if attention_mask is not None: - # shape: (batch_size, 1, 1, seq_len) - attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :] - attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min - - # Merge attention mask with attention bias. - if attention_bias is not None or attention_mask is not None or self.config.alibi or past_key_values is not None: - if attention_bias is None and self.config.alibi: - attention_bias = self.get_causal_attention_bias( - past_length + seq_len, x.device - ) + self.get_alibi_attention_bias(past_length + seq_len, x.device) - elif attention_bias is None: - attention_bias = self.get_causal_attention_bias(past_length + seq_len, x.device) - elif attention_bias.dtype in (torch.int8, torch.bool): - attention_bias = attention_bias.to(dtype=torch.float) - attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min) - - # Transform to the right shape and data type. - mask_len = seq_len - if attention_mask is not None: - mask_len = attention_mask.shape[-1] - elif past_key_values is not None: - mask_len = past_key_values[0][0].shape[-2] + seq_len - attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float) - - # Add in the masking bias. - if attention_mask is not None: - attention_bias = attention_bias + attention_mask - # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf. - # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead - # it can produce NaNs. - ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False) - - attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None - - # decoder layers - all_hidden_states = [] - - # Apply blocks one-by-one. - if self.config.block_group_size == 1: - for block_idx, block in enumerate(self.transformer.blocks): - if output_hidden_states: - # add hidden states - all_hidden_states.append(x) - - layer_past = None if past_key_values is None else past_key_values[block_idx] - # shape: (batch_size, seq_len, d_model) - x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache) - if attn_key_values is not None: - assert cache is not None - attn_key_values.append(cache) - else: - for group_idx, block_group in enumerate(self.transformer.block_groups): - if output_hidden_states: - # add hidden states - all_hidden_states.append(x) - - layers_past = ( - None - if past_key_values is None - else past_key_values[ - group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size - ] - ) - x, cache = block_group(x, attention_bias=attention_bias, layers_past=layers_past, use_cache=use_cache) - if attn_key_values is not None: - assert cache is not None - attn_key_values.extend(cache) - - if last_logits_only: - # shape: (batch_size, 1, d_model) - x = x[:, -1, :].unsqueeze(1) - - # Apply final layer norm. - # shape: (batch_size, seq_len or 1, d_model) - x = self.transformer.ln_f(x) # type: ignore - if output_hidden_states: - # add final hidden state post-final-layernorm, following HuggingFace's convention - all_hidden_states.append(x) - - # Get logits. 
- # shape: (batch_size, seq_len or 1, vocab_size) - if self.config.weight_tying: - logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore - else: - logits = self.transformer.ff_out(x) # type: ignore - if self.config.scale_logits: - logits.mul_(1 / math.sqrt(self.config.d_model)) - - return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] - - -def _olmo_causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor: - att_bias = torch.triu( - torch.ones(seq_len, seq_len, device=device, dtype=torch.float), - diagonal=1, - ) - att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min) - return att_bias.view(1, 1, seq_len, seq_len) # type: ignore - - -def _olmo_get_causal_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: - if hasattr(self, "causal_bias") and self.causal_bias.shape[-1] >= seq_len: - return self.causal_bias.to(device) - with torch.autocast(device.type, enabled=False): - causal_bias = _olmo_causal_attention_bias(seq_len, device) - self.register_buffer("causal_bias", causal_bias) - return causal_bias - - -def _olmo_alibi_attention_bias(seq_len: int, config, device: torch.device) -> torch.FloatTensor: - alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, 1, seq_len) - """ - A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities - for the next token *before* normalization via (log) softmax. - """ - # shape: (1, 1, seq_len, seq_len) - alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, seq_len, 1) - alibi_bias.abs_().mul_(-1) - - # shape: (n_heads,) - m = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device) - m.mul_(config.alibi_bias_max / config.n_heads) - - # shape: (1, n_heads, seq_len, seq_len) - return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore - - -def _olmo_get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: - alibi_bias = getattr(self, "alibi_attention_bias", None) - if alibi_bias is not None and alibi_bias.shape[-1] >= seq_len: - if alibi_bias.device != device: - alibi_bias = alibi_bias.to(device) - return alibi_bias - with torch.autocast(device.type, enabled=False): - alibi_bias = _olmo_alibi_attention_bias(seq_len, self.config, device) - self.register_buffer("alibi_attention_bias", alibi_bias) - return alibi_bias - - -def _olmo_get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]: - if ( - hasattr(self, "rope_pos_sin") - and hasattr(self, "rope_pos_cos") - and self.rope_pos_sin.shape[-2] >= seq_len - and self.rope_pos_cos.shape[-2] >= seq_len - ): - return self.rope_pos_sin.to(device)[:, :, :seq_len, :], self.rope_pos_sin.to(device)[:, :, :seq_len, :] - - with torch.autocast(device.type, enabled=False): - dim = self.config.d_model // self.config.n_heads - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim)) - seq = torch.arange(seq_len, device=device, dtype=torch.float) - freqs = torch.einsum("i , j -> i j", seq, inv_freq) - positions = torch.cat((freqs, freqs), dim=-1) - pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :] - - self.register_buffer("rope_pos_sin", pos_sin) - self.register_buffer("rope_pos_cos", pos_cos) - return pos_sin, pos_cos - - -class 
OLMoModelPatcher(DecoderModelPatcher): - def __enter__(self): - super().__enter__() - # model uses custom cache buffers for storing rotary_embeddings and attention biases. - # these objects are nontracable, replace them with standard torch tensors during export - self._model.model._orig_forward = self._model.model.forward - self._model.model._orig_get_alibi_attention_bias = self._model.model.get_alibi_attention_bias - self._model.model.forward = types.MethodType(_olmo_model_forward, self._model.model) - self._model.model.get_alibi_attention_bias = types.MethodType( - _olmo_get_alibi_attention_bias, self._model.model - ) - self._model.model.get_alibi_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) - self._model.model.get_causal_attention_bias = types.MethodType( - _olmo_get_causal_attention_bias, self._model.model - ) - self._model.model.get_causal_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) - for block in self._model.model.transformer.blocks: - block.rotary_emb._orig_get_rotary_embedding = block.rotary_emb.get_rotary_embedding - block.rotary_emb.get_rotary_embedding = types.MethodType(_olmo_get_rotary_embedding, block.rotary_emb) - block.rotary_emb.get_rotary_embedding(self._model.config.max_sequence_length, torch.device("cpu")) - - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - self._model.model.forward = self._model.model._orig_forward - self._model.model.get_alibi_attention_bias = self._model.model._orig_get_alibi_attention_bias - for block in self._model.model.transformer.blocks: - block.rotary_emb.get_rotary_embedding = block.rotary_emb._orig_get_rotary_embedding diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 70c6ceab5d..32fc255a1f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -525,10 +525,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "starcoder2", "phi", "internlm2", + "orion", ) GENERATION_LENGTH = 100 IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") - REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "decilm") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "olmo", "orion") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index bb11ad7baa..e7f62f1f61 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -36,7 +36,6 @@ "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", "deberta": "hf-internal-testing/tiny-random-deberta", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", - "decilm": "katuni4ka/tiny-random-deciml", "deit": "hf-internal-testing/tiny-random-deit", "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", @@ -71,6 +70,8 @@ "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", "mt5": "stas/mt5-tiny-random", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "olmo": "katuni4ka/tiny-random-olmo", + "orion": "katuni4ka/tiny-random-orion", "pegasus": "hf-internal-testing/tiny-random-pegasus", "pix2struct": "fxmarty/pix2struct-tiny-random", "phi": "echarlaix/tiny-random-PhiForCausalLM", From 2e5a791eb7efd5ba3fc30d10691fefedae2a41f3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix 
<80481427+echarlaix@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:20:57 +0200 Subject: [PATCH 4/5] Update optimum/exporters/openvino/__main__.py --- optimum/exporters/openvino/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 856ea6798f..bf45ab7e75 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -230,7 +230,7 @@ def main_export( logger.warning( "Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" "using default export configuration, `trust_remote_code` will be disabled. " - "Please provide custon export config if you want load model with remote code." + "Please provide custom export config if you want load model with remote code." ) trust_remote_code = False From be1a0d14cc160de1ddec0021f81ed72604acf721 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:21:02 +0200 Subject: [PATCH 5/5] Update optimum/exporters/openvino/__main__.py --- optimum/exporters/openvino/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index bf45ab7e75..dbea798f75 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -228,7 +228,7 @@ def main_export( and not custom_export_configs ): logger.warning( - "Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" + f"Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" "using default export configuration, `trust_remote_code` will be disabled. " "Please provide custom export config if you want load model with remote code." )
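
For reference, a minimal usage sketch of what this patch series enables (not part of the diff itself): once the new export configs are registered, one of the added architectures can be exported and run through the OpenVINO model classes. The checkpoint id below is the tiny test model referenced in tests/openvino/utils_tests.py; any InternLM2 or Orion checkpoint should follow the same pattern, and since both are listed in REMOTE_CODE_MODELS, trust_remote_code=True is needed at load time.

    # Sketch only, mirroring the flow of test_compare_to_transformers: it exercises
    # the newly registered "internlm2" export config. The checkpoint id is the test
    # fixture from utils_tests.py; substitute any compatible checkpoint.
    from transformers import AutoTokenizer
    from optimum.intel import OVModelForCausalLM

    model_id = "katuni4ka/tiny-random-internlm2"
    # export=True converts the PyTorch checkpoint to OpenVINO IR on the fly,
    # routing through the export configs added in this series.
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    inputs = tokenizer("Hello, my name is", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=10)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The same conversion can presumably also be driven from the command line with `optimum-cli export openvino --model <model_id> --trust-remote-code <output_dir>`, which goes through the main_export function touched in patches 3-5 (including the new logic that disables trust_remote_code for falcon/mpt/phi when the default with-past export config is used).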