Commit eb044a9

mixtral and model patcher
1 parent 9068f37 commit eb044a9

5 files changed: +103 -10 lines changed

optimum/exporters/openvino/__main__.py (+9 -9)

@@ -59,7 +59,7 @@ def main_export(
     local_files_only: bool = False,
     use_auth_token: Optional[Union[bool, str]] = None,
     model_kwargs: Optional[Dict[str, Any]] = None,
-    custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None,
+    custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None,
     fn_get_submodels: Optional[Callable] = None,
     compression_option: Optional[str] = None,
     compression_ratio: Optional[float] = None,
@@ -112,11 +112,11 @@ def main_export(
             when running `transformers-cli login` (stored in `~/.huggingface`).
         model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`):
             Experimental usage: keyword arguments to pass to the model during
-            the export. This argument should be used along the `custom_onnx_configs` argument
+            the export. This argument should be used along the `custom_export_configs` argument
             in case, for example, the model inputs/outputs are changed (for example, if
             `model_kwargs={"output_attentions": True}` is passed).
-        custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`):
-            Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model).
+        custom_export_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`):
+            Experimental usage: override the default export config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model).
         fn_get_submodels (`Optional[Callable]`, defaults to `None`):
             Experimental usage: Override the default submodels that are used at the export. This is
             especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success.
@@ -134,7 +134,7 @@ def main_export(
     ```python
     >>> from optimum.exporters.openvino import main_export

-    >>> main_export("gpt2", output="gpt2_onnx/")
+    >>> main_export("gpt2", output="gpt2_ov/")
     ```
     """
     original_task = task
@@ -183,14 +183,14 @@ def main_export(
     if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
         custom_architecture = True
     elif task not in TasksManager.get_supported_tasks_for_model_type(
-        model_type, exporter="onnx", library_name=library_name
+        model_type, exporter="openvino", library_name=library_name
     ):
         if original_task == "auto":
             autodetected_message = " (auto-detected)"
         else:
             autodetected_message = ""
         model_tasks = TasksManager.get_supported_tasks_for_model_type(
-            model_type, exporter="onnx", library_name=library_name
+            model_type, exporter="openvino", library_name=library_name
         )
         raise ValueError(
             f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}."
@@ -265,7 +265,7 @@ class StoreAttr(object):
         not custom_architecture
         and library_name != "diffusers"
         and task + "-with-past"
-        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx", library_name=library_name)
+        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="openvino", library_name=library_name)
     ):
         # Make -with-past the default if --task was not explicitely specified
         if original_task == "auto":
@@ -297,7 +297,7 @@ class StoreAttr(object):
         compression_ratio=compression_ratio,
         stateful=stateful,
         model_kwargs=model_kwargs,
-        custom_onnx_configs=custom_onnx_configs,
+        custom_export_configs=custom_export_configs,
         fn_get_submodels=fn_get_submodels,
         preprocessors=preprocessors,
         device=device,
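
The rename from `custom_onnx_configs` to `custom_export_configs` keeps the call pattern unchanged. A minimal sketch of driving the updated exporter follows; the checkpoint is the tiny Mixtral model registered in the tests below, and the output directory is purely illustrative:

from optimum.exporters.openvino import main_export

# Export a Mixtral checkpoint through the OpenVINO exporter; a custom config
# could also be supplied via the renamed `custom_export_configs` argument.
main_export(
    "TitanML/tiny-mixtral",
    output="tiny_mixtral_ov/",  # illustrative output directory
    task="text-generation-with-past",
)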

optimum/exporters/openvino/model_configs.py (+33)

@@ -12,12 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+
+from packaging import version
+from transformers.utils import is_tf_available

 from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
 from optimum.exporters.tasks import TasksManager
 from optimum.utils.input_generators import DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator
 from optimum.utils.normalized_config import NormalizedTextConfig

+from .model_patcher import MixtralModelPatcher
+
+
+if TYPE_CHECKING:
+    from transformers.modeling_utils import PreTrainedModel
+
+    from optimum.exporters.onnx.model_patcher import ModelPatcher
+
+    if is_tf_available():
+        from transformers.modeling_tf_utils import TFPreTrainedModel

 register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True)

@@ -54,3 +68,22 @@ class MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator)
     DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+
+
+@register_in_tasks_manager("mixtral", *["text-generation", "text-generation-with-past"])
+class MixtralOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
+    # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35
+    MIN_TRANSFORMERS_VERSION = version.parse("4.34.99")
+
+    # The ONNX export of this architecture needs the Trilu operator support, available since opset 14
+    DEFAULT_ONNX_OPSET = 14
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        MistralDummyPastKeyValuesGenerator,
+    ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+    DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True)
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return MixtralModelPatcher(self, model, model_kwargs=model_kwargs)
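
Because the config is registered under the "openvino" exporter name, it matches the `exporter="openvino"` lookups added in `__main__.py`. A small sketch of checking the registration, assuming the module has been imported so the decorator has run:

from optimum.exporters.tasks import TasksManager

import optimum.exporters.openvino.model_configs  # noqa: F401  # importing runs the @register_in_tasks_manager decorators

# The "openvino" registry should now expose the tasks declared for Mixtral above.
tasks = TasksManager.get_supported_tasks_for_model_type(
    "mixtral", exporter="openvino", library_name="transformers"
)
print(sorted(tasks))  # expected to include 'text-generation' and 'text-generation-with-past'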

optimum/exporters/openvino/model_patcher.py (+55)

@@ -13,7 +13,12 @@
 # limitations under the License.

 import logging as log
+import types

+import torch
+import torch.nn.functional as F
+
+from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
 from optimum.intel.utils.import_utils import (
     _torch_version,
     _transformers_version,
@@ -52,3 +57,53 @@ def patch_model_with_bettertransformer(model):
         return model

     return model
+
+
+def mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    """ """
+    batch_size, sequence_length, hidden_dim = hidden_states.shape
+    hidden_states = hidden_states.view(-1, hidden_dim)
+    # router_logits: (batch * sequence_length, n_experts)
+    router_logits = self.gate(hidden_states)
+
+    routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+    routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+    routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+    # we cast back to the input dtype
+    routing_weights = routing_weights.to(hidden_states.dtype)
+
+    final_hidden_states = torch.zeros(
+        (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+    )
+
+    # One hot encode the selected experts to create an expert mask
+    # this will be used to easily index which expert is going to be sollicitated
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+    # Loop over all available experts in the model and perform the computation on each expert
+    for expert_idx in range(self.num_experts):
+        expert_layer = self.experts[expert_idx]
+        idx, top_x = torch.where(expert_mask[expert_idx])
+
+        # Index the correct hidden states and compute the expert hidden state for
+        # the current expert. We need to make sure to multiply the output hidden
+        # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+        current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+        current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+        final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+    final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+    return final_hidden_states, router_logits
+
+
+class MixtralModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        for layer in self._model.model.layers:
+            layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward
+            layer.block_sparse_moe.forward = types.MethodType(mixtral_sparse_moe_block_forward, layer.block_sparse_moe)

+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        for layer in self._model.model.layers:
+            layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward
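
The patcher is a context manager: `__enter__` swaps every `MixtralSparseMoeBlock.forward` for the version above, which avoids the tensor-to-Python `.tolist()` indexing and the data-dependent `continue` of the upstream implementation that do not trace cleanly, and `__exit__` restores the originals. A rough usage sketch, assuming the tiny Mixtral checkpoint from the tests and the export config defined in `model_configs.py`:

import torch
from transformers import AutoModelForCausalLM

from optimum.exporters.openvino.model_configs import MixtralOpenVINOConfig
from optimum.exporters.openvino.model_patcher import MixtralModelPatcher, mixtral_sparse_moe_block_forward

model = AutoModelForCausalLM.from_pretrained("TitanML/tiny-mixtral")
export_config = MixtralOpenVINOConfig(model.config, task="text-generation")

with MixtralModelPatcher(export_config, model):
    # While the context is active, each MoE block runs the patched forward.
    moe = model.model.layers[0].block_sparse_moe
    assert moe.forward.__func__ is mixtral_sparse_moe_block_forward
    model(input_ids=torch.ones((1, 8), dtype=torch.long))  # tracing/export would happen here
# On exit the original MixtralSparseMoeBlock.forward methods are restored.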

tests/openvino/test_modeling.py (+5 -1)

@@ -486,6 +486,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "llama_gptq",
         "marian",
         "mistral",
+        "mixtral",
         "mpt",
         "opt",
         "pegasus",
@@ -520,7 +521,10 @@ def test_compare_to_transformers(self, model_arch):
         self.assertIsInstance(ov_outputs.logits, torch.Tensor)
         self.assertTrue("past_key_values" in ov_outputs)
         self.assertIsInstance(ov_outputs.past_key_values, tuple)
-        if self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode":
+        not_stateful = ["gpt_bigcode"]
+        if is_openvino_version("<", "2024.0"):
+            not_stateful.append("mixtral")
+        if self.IS_SUPPORT_STATEFUL and model_arch not in not_stateful:
             self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)

tests/openvino/utils_tests.py (+1)

@@ -56,6 +56,7 @@
     "marian": "sshleifer/tiny-marian-en-de",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "mistral": "echarlaix/tiny-random-mistral",
+    "mixtral": "TitanML/tiny-mixtral",
     "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel",
     "mobilenet_v1": "google/mobilenet_v1_0.75_192",
     "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model",
