Commit cdbff81

Merge branch 'main' into bump-release
2 parents 8fb8cfe + bc5051f

14 files changed (+698 −168 lines)

optimum/commands/export/openvino.py (+11)

@@ -119,6 +119,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
         ),
     )
+    optional_group.add_argument(
+        "--all-layers",
+        action="store_true",
+        default=None,
+        help=(
+            "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided and weight "
+            "compression is applied, they are compressed to INT8."
+        ),
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -198,6 +207,7 @@ def run(self):
             and self.args.ratio is None
             and self.args.group_size is None
             and self.args.sym is None
+            and self.args.all_layers is None
             and self.args.model in _DEFAULT_4BIT_CONFIGS
         ):
             quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -207,6 +217,7 @@
                 "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
                 "sym": self.args.sym or False,
                 "group_size": -1 if is_int8 else self.args.group_size,
+                "all_layers": None if is_int8 else self.args.all_layers,
             }

         if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
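For context, a hypothetical invocation of the new flag through the export CLI. The model id and output directory are placeholders, and the plain "int4" weight-format alias is assumed to be available in this version; when --all-layers is passed with a 4-bit format, embeddings and last MatMul layers are compressed to INT4 as well instead of staying at INT8:

optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format int4 --all-layers ov_llama_int4

Leaving the flag at its default of None (rather than False) is what lets the run() check above fall back to a model-specific entry in _DEFAULT_4BIT_CONFIGS when no quantization option was set explicitly.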

optimum/exporters/openvino/convert.py (+6 −1)

@@ -614,7 +614,12 @@ def export_from_model(
     model.config.save_pretrained(output)
     generation_config = getattr(model, "generation_config", None)
     if generation_config is not None:
-        generation_config.save_pretrained(output)
+        try:
+            generation_config.save_pretrained(output)
+        except Exception as exception:
+            logger.warning(
+                f"The generation config will not be saved, saving failed with the following error:\n{exception}"
+            )

     model_name_or_path = model.config._name_or_path
     maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
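A minimal sketch of the failure mode this guards against, assuming (as in recent transformers releases) that GenerationConfig.save_pretrained() validates the config and refuses to write one holding conflicting values; the triggering values below are illustrative, not taken from this commit:

from transformers import GenerationConfig

# Illustrative conflict: a sampling knob is set while sampling is disabled.
generation_config = GenerationConfig(do_sample=False, temperature=0.4)

try:
    generation_config.save_pretrained("exported_model")
except Exception as exception:  # e.g. ValueError raised by config validation
    print(f"The generation config will not be saved, saving failed with the following error:\n{exception}")

With the try/except in place, a checkpoint shipping an invalid generation config no longer aborts the whole export; the OpenVINO model is still written and only the generation config is skipped, with a warning.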

optimum/exporters/openvino/model_configs.py (+148 −2)

@@ -42,15 +42,18 @@
 from optimum.utils.normalized_config import NormalizedTextConfig

 from .model_patcher import (
+    AquilaModelPatcher,
     BaichuanModelPatcher,
     ChatGLMModelPatcher,
     GemmaModelPatcher,
-    InternLMPatcher,
+    InternLM2Patcher,
+    InternLMModelPatcher,
     LlamaModelPatcher,
     MixtralModelPatcher,
     MPTModelPatcher,
     Phi3ModelPatcher,
     QwenModelPatcher,
+    XverseModelPatcher,
 )


@@ -461,7 +464,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
-        return InternLMPatcher(self, model, model_kwargs=model_kwargs)
+        return InternLM2Patcher(self, model, model_kwargs=model_kwargs)


 @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers")
@@ -501,6 +504,12 @@ def patch_model_for_export(
     library_name="transformers",
 )
 class Phi3OpenVINOConfig(PhiOnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        MistralDummyPastKeyValuesGenerator,
+    ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+    DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True)
+
     def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
@@ -608,3 +617,140 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
         return {
             "sample": {0: "batch_size", 2: "height", 3: "width"},
         }
+
+
+@register_in_tasks_manager(
+    "persimmon",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-generation",
+        "text-generation-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
+class PersimmonOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+
+
+@register_in_tasks_manager("biogpt", *["text-generation", "text-generation-with-past"], library_name="transformers")
+class BioGPTOpenVINOConfig(TextDecoderOnnxConfig):
+    # BioGPT does not require position_ids input.
+    DEFAULT_ONNX_OPSET = 13
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+
+
+@register_in_tasks_manager(
+    "gpt-neox-japanese", *["text-generation", "text-generation-with-past"], library_name="transformers"
+)
+class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig):
+    # GPTNeoxJapanese does not require position_ids input.
+    DEFAULT_ONNX_OPSET = 13
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+
+
+@register_in_tasks_manager(
+    "cohere",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-generation",
+        "text-generation-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
+class CohereOpenVINOConfig(LlamaOpenVINOConfig):
+    pass
+
+
+@register_in_tasks_manager("xglm", *["text-generation", "text-generation-with-past"], library_name="transformers")
+class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig):
+    DEFAULT_ONNX_OPSET = 13
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(
+        num_attention_heads="attention_heads", hidden_size="d_model"
+    )
+
+
+class AquilaDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator):
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedTextConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"],
+        random_batch_size_range: Optional[Tuple[int, int]] = None,
+        random_sequence_length_range: Optional[Tuple[int, int]] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            task,
+            normalized_config,
+            batch_size,
+            sequence_length,
+            random_batch_size_range,
+            random_sequence_length_range,
+            **kwargs,
+        )
+        self.num_key_value_heads = getattr(
+            normalized_config, "num_key_value_heads", normalized_config.num_attention_heads
+        )
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        shape = (
+            self.batch_size,
+            self.num_key_value_heads,
+            self.sequence_length,
+            self.hidden_size // self.num_attention_heads,
+        )
+        return [
+            (
+                self.random_float_tensor(shape, framework=framework, dtype=float_dtype),
+                self.random_float_tensor(shape, framework=framework, dtype=float_dtype),
+            )
+            for _ in range(self.num_layers)
+        ]
+
+
+@register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers")
+class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator)
+    DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True)
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return AquilaModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers")
+class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator)
+    DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return XverseModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], library_name="transformers")
+class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator)
+    DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ) -> "ModelPatcher":
        return InternLMModelPatcher(self, model, model_kwargs=model_kwargs)
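With these registrations in place, the newly covered decoder architectures should be exportable through the regular optimum-intel Python API. A hypothetical sketch follows; the checkpoint id is a placeholder for any of the newly registered architectures (persimmon, biogpt, gpt-neox-japanese, cohere, xglm, aquila, xverse, internlm), and OVModelForCausalLM with export=True is the standard entry point rather than anything introduced by this commit:

from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

# Placeholder checkpoint id for one of the newly registered architectures.
model_id = "adept/persimmon-8b-chat"

# export=True runs the OpenVINO export path that the configs above register
# in the tasks manager for their respective model types.
model = OVModelForCausalLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))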
