Commit 2696e6f

Add export config for gemma2 (huggingface#876)
* add export config for gemma2
* update cache position and tests
* update model list
* fix without-cache export
* patch original torch gemma2 to work with dynamic cache
* Update tests/openvino/test_modeling.py
* prevent usage of cache implementation
* add min transformers version
1 parent d4e3128 commit 2696e6f
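
After this commit, a Gemma2 checkpoint can be exported and run through the usual optimum-intel entry point. A minimal usage sketch; the checkpoint id and generation settings below are illustrative assumptions, not part of the commit:

# Minimal usage sketch; the checkpoint id is an assumption, any Gemma2 model should work.
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_id = "google/gemma-2-2b-it"  # assumed checkpoint, requires transformers >= 4.43
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = OVModelForCausalLM.from_pretrained(model_id, export=True)  # converts to OpenVINO IR on the fly

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))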

File tree

6 files changed: +112 -6 lines changed

docs/source/openvino/models.mdx

+1

@@ -55,6 +55,7 @@ Here is the list of the supported architectures :
 - GPT-NeoX
 - GPT-NeoX-Japanese
 - Gemma
+- Gemma2
 - Hubert
 - IBert
 - InternLM

optimum/exporters/openvino/model_configs.py

+21

@@ -54,6 +54,7 @@
     CodeGenModelPatcher,
     DBRXModelPatcher,
     FalconModelPatcher,
+    Gemma2ModelPatcher,
     GptNeoxJapaneseModelPatcher,
     GptNeoxModelPatcher,
     InternLM2Patcher,
@@ -997,3 +998,23 @@ def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
         return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager(
+    "gemma2",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-generation",
+        "text-generation-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
+class Gemma2OpenVINOConfig(GemmaOnnxConfig):
+    MIN_TRANSFORMERS_VERSION = version.parse("4.43.0")
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return Gemma2ModelPatcher(self, model, model_kwargs=model_kwargs)
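
Registering the config via register_in_tasks_manager makes it discoverable by the OpenVINO export machinery. A short sketch of driving the export programmatically, assuming a checkpoint id and output directory that are not part of this commit:

# Sketch of exporting a Gemma2 checkpoint with the OpenVINO exporter; the id and path are assumptions.
from optimum.exporters.openvino import main_export

main_export(
    model_name_or_path="google/gemma-2-2b-it",  # assumed checkpoint
    output="gemma2_openvino",                   # assumed output directory
    task="text-generation-with-past",           # one of the tasks registered above
)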

optimum/exporters/openvino/model_patcher.py

+59 -1

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import functools
 import inspect
 import logging as log
 import math
@@ -23,7 +24,7 @@
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.utils import is_tf_available

-from optimum.exporters.onnx.model_patcher import DecoderModelPatcher
+from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, override_arguments
 from optimum.intel.utils.import_utils import (
     _openvino_version,
     _torch_version,
@@ -2409,3 +2410,60 @@ def __enter__(self):
         super().__enter__()
         for layer in self._model.gpt_neox_japanese.layers:
             _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+
+
+class Gemma2ModelPatcher(LlamaModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        super().__init__(config, model, model_kwargs)
+
+        @functools.wraps(self.orig_forward)
+        def patched_forward(*args, **kwargs):
+            from transformers.cache_utils import DynamicCache
+
+            signature = inspect.signature(self.orig_forward)
+            args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs)
+
+            return_legacy_cache = False
+            pkv_in_args = False
+            legacy_pkv = None
+            if "past_key_values" in kwargs:
+                legacy_pkv = kwargs.pop("past_key_values", None)
+            sign_names = list(signature.parameters.keys())
+            pkv_argument_index = sign_names.index("past_key_values")
+            cache_position_index = sign_names.index("cache_position") if "cache_position" in sign_names else -1
+            input_ids_index = sign_names.index("input_ids" if "input_ids" in sign_names else "inputs_embeds")
+            if legacy_pkv is None and len(args) > pkv_argument_index:
+                legacy_pkv = args[pkv_argument_index]
+                pkv_in_args = True
+            if legacy_pkv is not None:
+                pkv = DynamicCache.from_legacy_cache(legacy_pkv)
+                return_legacy_cache = True
+                if not pkv_in_args:
+                    kwargs["past_key_values"] = pkv
+                else:
+                    args[pkv_argument_index] = pkv
+
+            if (
+                return_legacy_cache
+                and cache_position_index != -1
+                and (cache_position_index > len(args) and "cache_position" not in kwargs)
+            ):
+                past_seen_tokens = legacy_pkv[0][0].shape[-2]
+                input_ids = args[input_ids_index]
+                cache_position = torch.arange(
+                    past_seen_tokens, past_seen_tokens + input_ids.shape[1], device=input_ids.device
+                )
+                kwargs["cache_position"] = cache_position
+
+            outputs = self.orig_forward(*args, **kwargs)
+            if return_legacy_cache:
+                outputs.past_key_values = outputs.past_key_values.to_legacy_cache()
+
+            return outputs
+
+        self.patched_forward = patched_forward
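
For context, the cache handling this patcher adds can be sketched in isolation: a legacy tuple-of-tuples past_key_values is wrapped into a transformers DynamicCache before calling the original forward and converted back afterwards. A minimal sketch; the tensor shapes are illustrative assumptions, not taken from the commit:

# Minimal sketch of the legacy <-> DynamicCache round trip used by Gemma2ModelPatcher.
# Assumes transformers >= 4.43; shapes (1 layer, batch 1, 2 heads, 3 past tokens, head dim 4) are arbitrary.
import torch
from transformers.cache_utils import DynamicCache

# legacy format: tuple of per-layer (key, value) pairs, each [batch, num_heads, seq_len, head_dim]
legacy_pkv = ((torch.zeros(1, 2, 3, 4), torch.zeros(1, 2, 3, 4)),)

cache = DynamicCache.from_legacy_cache(legacy_pkv)   # what the patcher feeds into forward
past_seen_tokens = legacy_pkv[0][0].shape[-2]         # 3, used to build cache_position
cache_position = torch.arange(past_seen_tokens, past_seen_tokens + 1)  # position of the next token

roundtrip = cache.to_legacy_cache()                   # what the patcher hands back to the exporter
assert roundtrip[0][0].shape == legacy_pkv[0][0].shape
print(past_seen_tokens, cache_position)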

optimum/intel/openvino/modeling_decoder.py

+2

@@ -806,6 +806,8 @@ def _from_pretrained(
                 force_download=force_download,
                 local_files_only=local_files_only,
             )
+            if getattr(generation_config, "cache_implementation", None) is not None:
+                generation_config.cache_implementation = None
             kwargs["generation_config"] = generation_config
         except Exception:
             pass
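
This guards against Gemma2 checkpoints that ship cache_implementation="hybrid" in their generation config, which the exported stateful OpenVINO model does not use. A minimal sketch of the condition being neutralized; the config value is constructed here purely for illustration:

# Sketch of the guard above; cache_implementation="hybrid" mimics what Gemma2 checkpoints ship.
from transformers import GenerationConfig

generation_config = GenerationConfig(cache_implementation="hybrid")
if getattr(generation_config, "cache_implementation", None) is not None:
    generation_config.cache_implementation = None  # fall back to the model's own stateful cache
print(generation_config.cache_implementation)  # None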

tests/openvino/test_modeling.py

+28 -5

@@ -57,6 +57,7 @@
 from transformers.testing_utils import slow
 from utils_tests import MODEL_NAMES

+from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
 from optimum.intel import (
     OVModelForAudioClassification,
     OVModelForAudioFrameClassification,
@@ -647,6 +648,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES += (
             "gemma",
+            "gemma2",
             "olmo",
             "stablelm",
             "starcoder2",
@@ -728,7 +730,8 @@ def test_compare_to_transformers(self, model_arch):
         self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4))

         # Qwen tokenizer does not support padding
-        if model_arch == "qwen":
+
+        if model_arch in ["qwen"]:
             return

         if model_arch not in ["chatglm", "glm4", "persimmon"]:
@@ -753,7 +756,16 @@ def test_compare_to_transformers(self, model_arch):
         )

         ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
-        transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config)
+        additional_inputs = {}
+        # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache,
+        # align cache representation in torch model
+        if model_arch == "gemma2":
+            patch_update_causal_mask(transformers_model, "4.43.0")
+            transformers_model._supports_cache_class = True
+            from transformers.cache_utils import DynamicCache
+
+            additional_inputs = {"past_key_values": DynamicCache()}
+        transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs)
         self.assertTrue(torch.allclose(ov_outputs, transformers_outputs))

         del transformers_model
@@ -921,8 +933,8 @@ def test_beam_search(self, model_arch):
             "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
             "trust_remote_code": True,
         }
-        # Qwen tokenizer does not support padding, chatgm testing model produces nan that incompatible with beam search
-        if model_arch in ["qwen", "chatglm"]:
+        # Qwen tokenizer does not support padding, chatglm, glm4 testing models produce nan that incompatible with beam search
+        if model_arch in ["qwen", "chatglm", "glm4"]:
             return

         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
@@ -988,6 +1000,12 @@ def test_beam_search(self, model_arch):

         if model_arch == "arctic":
             transformers_model.to(torch.float32)
+        additional_inputs = {}
+        # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, align cache representation in torch model
+        if model_arch == "gemma2":
+            patch_update_causal_mask(transformers_model, "4.43.0")
+            transformers_model._supports_cache_class = True
+            from transformers.cache_utils import DynamicCache
         tokenizer.pad_token_id = tokenizer.eos_token_id
         tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
         tokens.pop("token_type_ids", None)
@@ -1002,7 +1020,12 @@ def test_beam_search(self, model_arch):
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]:
                 continue
             set_seed(SEED)
-            transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config)
+
+            if model_arch == "gemma2":
+                additional_inputs = {"past_key_values": DynamicCache()}
+            transformers_outputs = transformers_model.generate(
+                **tokens, generation_config=gen_config, **additional_inputs
+            )
             set_seed(SEED)
             ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config)
             self.assertTrue(
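
A condensed sketch of the gemma2-specific branch exercised by these tests, using the tiny checkpoint registered in utils_tests.py; the generation settings are illustrative assumptions, not the test's exact configuration:

# Sketch of the comparison pattern above: align the PyTorch reference's cache representation
# with the exported OpenVINO model before comparing generate() outputs.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import DynamicCache
from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
from optimum.intel import OVModelForCausalLM

model_id = "katuni4ka/tiny-random-gemma2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer("This is a sample input", return_tensors="pt")

transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True)

# As in the tests: patch the causal mask update and pass an explicit DynamicCache
# so the reference model does not build its default hybrid cache.
patch_update_causal_mask(transformers_model, "4.43.0")
transformers_model._supports_cache_class = True

transformers_outputs = transformers_model.generate(
    **tokens, max_new_tokens=10, do_sample=False, past_key_values=DynamicCache()
)
ov_outputs = ov_model.generate(**tokens, max_new_tokens=10, do_sample=False)
print(torch.allclose(ov_outputs, transformers_outputs))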

tests/openvino/utils_tests.py

+1

@@ -54,6 +54,7 @@
     "electra": "hf-internal-testing/tiny-random-electra",
     "exaone": "katuni4ka/tiny-random-exaone",
     "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
+    "gemma2": "katuni4ka/tiny-random-gemma2",
     "falcon": "fxmarty/really-tiny-falcon-testing",
     "falcon-40b": "katuni4ka/tiny-random-falcon-40b",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
