@@ -288,10 +288,74 @@ def __exit__(self, exc_type, exc_value, traceback):
             block.self_attention.core_attention.forward = block.self_attention.core_attention._orig_forward


+# adapted from
+# https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/gemma/modeling_gemma.py#L965
+# https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/llama/modeling_llama.py#L1058
+def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_position, **kwargs):
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+
+    # for compatibility with https://github.com/huggingface/transformers/pull/30047
+    current_length = kwargs.get("current_length", cache_position[-1])
+    dtype, device = input_tensor.dtype, input_tensor.device
+
+    # using the minimum of a dtype with larger bandwidth (float32) may lead to overflow
+    # during execution on platforms with default lower precision (bfloat16, float16)
+    min_dtype = torch.finfo(torch.float16).min
+    sequence_length = input_tensor.shape[1]
+    if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"):  # static cache
+        target_length = self.config.max_position_embeddings
+    else:  # dynamic cache
+        target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + 1
+
+    causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype
+    if sequence_length != 1:
+        causal_mask = torch.triu(causal_mask, diagonal=1)
+    causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+    causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+    if attention_mask is not None:
+        causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+        if attention_mask.dim() == 2:
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
+            causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
+        elif attention_mask.dim() == 4:
+            # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with
+            # cache. In that case, the 4D attention mask attends to the newest tokens only.
+            if attention_mask.shape[-2] < cache_position[0] + sequence_length:
+                offset = cache_position[0]
+            else:
+                offset = 0
+            mask_shape = attention_mask.shape
+            mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
+            causal_mask[
+                : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
+            ] = mask_slice
+
+    if (
+        self.config._attn_implementation == "sdpa"
+        and attention_mask is not None
+        and attention_mask.device.type == "cuda"
+    ):
+        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        # Details: https://github.com/pytorch/pytorch/issues/110213
+        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+    return causal_mask
+
+
 class GemmaModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()

+        # gemma has some accuracy issues with bf16 with transformers >= 4.39
+        # fill causal mask in a slightly different way to avoid overflow on some platforms
+        if is_transformers_version(">=", "4.39.0"):
+            self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
+            self._model.model._update_causal_mask = types.MethodType(
+                _llama_gemma_update_causal_mask, self._model.model
+            )
+
         # init inv_freq for torchscript tracing
         # https://github.com/huggingface/transformers/blob/ed74d97871468f3a4695ede50abdc0b55717a84d/src/transformers/models/gemma/modeling_gemma.py#L108
         for layer in self._model.model.layers:
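
As a side note on the hunk above: the mask is built by starting from an all-min_dtype matrix, zeroing the lower triangle, and then re-masking every key position past each token's cache position. A minimal standalone sketch of that trick on toy shapes (plain PyTorch, illustrative only, not part of this diff):

import torch

# toy dimensions: 4 query tokens attending into a key/value space of 6 positions
sequence_length, target_length = 4, 6
cache_position = torch.arange(2, 2 + sequence_length)  # queries sit at absolute positions 2..5
min_dtype = torch.finfo(torch.float16).min  # finite fp16 minimum, safe on bf16/fp16 platforms

causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=torch.float32) * min_dtype
causal_mask = torch.triu(causal_mask, diagonal=1)  # keep masking only strictly above the diagonal
causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)  # unmask keys <= cache position

print(causal_mask)
# row i is 0.0 (attend) for key positions <= cache_position[i] and min_dtype (masked) afterwards
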
@@ -301,6 +365,29 @@ def __enter__(self):
                     rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim)
                 )

+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if hasattr(self._model.model, "_orig_update_causal_mask"):
+            self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
+
+
+class LlamaModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+
+        # llama has some accuracy issues with bf16 with transformers >= 4.39
+        # fill causal mask in a slightly different way to avoid overflow on some platforms
+        if is_transformers_version(">=", "4.39.0"):
+            self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
+            self._model.model._update_causal_mask = types.MethodType(
+                _llama_gemma_update_causal_mask, self._model.model
+            )
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if hasattr(self._model.model, "_orig_update_causal_mask"):
+            self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
+

 SUPPORT_SDPA = is_torch_version(">", "2.1.0")

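
The patcher changes above all follow the same save/patch/restore pattern: stash the bound _update_causal_mask, rebind the module-level replacement with types.MethodType in __enter__, and put the original back in __exit__. A self-contained sketch of that pattern with a dummy object (nothing below is part of the PR; DummyModel is just a stand-in for model.model):

import types

class DummyModel:
    def _update_causal_mask(self, attention_mask):
        return "original"

def _patched_update_causal_mask(self, attention_mask):
    return "patched"

model = DummyModel()

# __enter__: keep a reference to the original bound method, then shadow it on the instance
model._orig_update_causal_mask = model._update_causal_mask
model._update_causal_mask = types.MethodType(_patched_update_causal_mask, model)
assert model._update_causal_mask(None) == "patched"

# __exit__: restore the original implementation
if hasattr(model, "_orig_update_causal_mask"):
    model._update_causal_mask = model._orig_update_causal_mask
assert model._update_causal_mask(None) == "original"
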
@@ -465,7 +552,6 @@ def _qwen_attention_forward(
             raise ValueError("Cannot output attentions while using flash-attn")
         else:
             outputs += (attn_weight,)
-
     return outputs