From 08e3c3df0c00ebc1c8bbc7f68010079dda28090a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 16 May 2024 12:00:34 +0200
Subject: [PATCH 01/13] Fix compatibility for transformers v4.41.0 llama and
 gemma modeling patching

---
 optimum/exporters/openvino/model_patcher.py | 104 +++++++++++++++++++-
 setup.py                                    |   2 +-
 2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index f68e873d40..804adda04d 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -293,7 +293,7 @@ def __exit__(self, exc_type, exc_value, traceback):
 # adopted from
 # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/gemma/modeling_gemma.py#L965
 # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/llama/modeling_llama.py#L1058
-def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None):
+def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None):
     from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
     if self.config._attn_implementation == "sdpa" and past_seen_tokens is not None:
@@ -306,10 +306,12 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po
 
     dtype, device = input_tensor.dtype, input_tensor.device
 
+    # difference with original modeling
     # using minimum from dtype with larger bandwith (floa32) may lead to overflow
     # during execution on platforms with default lower precision (bfloat16, float16)
     min_dtype = torch.finfo(torch.float16).min
     sequence_length = input_tensor.shape[1]
+    # difference with original modeling
     if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"):  # static cache
         target_length = self.config.max_position_embeddings
     else:  # dynamic cache
@@ -321,7 +323,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po
 
         target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length
 
+    # difference with original modeling
     causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype
+
     if sequence_length != 1:
         causal_mask = torch.triu(causal_mask, diagonal=1)
     causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
@@ -358,6 +362,104 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po
     return causal_mask
 
 
+# adopted from https://github.com/huggingface/transformers/blob/f4014e75db0190792b3feeccfc5dc5b5f9f0ce7b/src/transformers/models/llama/modeling_llama.py#L1036
+def _llama_gemma_update_causal_mask_latest(
+    self,
+    attention_mask,
+    input_tensor,
+    cache_position,
+    past_key_values,
+    output_attentions,
+):
+    from transformers.cache_utils import StaticCache
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+
+    # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
+    # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode step due to the dynamic shapes.
+    # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
+    # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
+
+    if self.config._attn_implementation == "flash_attention_2":
+        if attention_mask is not None and 0.0 in attention_mask:
+            return attention_mask
+        return None
+
+    # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+    # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+    # to infer the attention mask.
+    past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+    using_static_cache = isinstance(past_key_values, StaticCache)
+
+    # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+    if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+        if AttentionMaskConverter._ignore_causal_mask_sdpa(
+            attention_mask,
+            inputs_embeds=input_tensor,
+            past_key_values_length=past_seen_tokens,
+            is_training=self.training,
+        ):
+            return None
+
+    dtype, device = input_tensor.dtype, input_tensor.device
+    # difference with original modeling
+    # using minimum from dtype with larger bandwidth (float32) may lead to overflow
+    # during execution on platforms with default lower precision (bfloat16, float16)
+    min_dtype = torch.finfo(torch.float16).min
+
+    sequence_length = input_tensor.shape[1]
+    if using_static_cache:
+        target_length = past_key_values.get_max_length()
+    else:
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else past_seen_tokens + sequence_length + 1
+        )
+
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+        if attention_mask.max() != 0:
+            raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+        causal_mask = attention_mask
+    else:
+        # difference with original modeling
+        causal_mask = (
+            torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype
+        )
+
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+    if (
+        self.config._attn_implementation == "sdpa"
+        and attention_mask is not None
+        and attention_mask.device.type == "cuda"
+        and not output_attentions
+    ):
+        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        # Details: https://github.com/pytorch/pytorch/issues/110213
+        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+    return causal_mask
+
+
+# TODO : deprecate _llama_gemma_update_causal_mask_legacy when transformers>=4.41.0
+if is_transformers_version(">=", "4.41.0"):
+    _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_latest
+else:
+    _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy
+
+
 class GemmaModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
diff --git a/setup.py b/setup.py
index 251ec61cdd..dd7b157b33 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@
 
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36.0,<4.41.0",
+    "transformers @ git+https://github.com/huggingface/transformers.git",
     "optimum~=1.19",
     "datasets>=1.4.0",
     "sentencepiece",

From ddd35be27fd15c68ab7f8752f85bb96c87bf92c2 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Thu, 16 May 2024 16:36:03 +0200
Subject: [PATCH 02/13] install from source

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index dd7b157b33..edb914ec61 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,8 @@
 INSTALL_REQUIRE = [
     "torch>=1.11",
     "transformers @ git+https://github.com/huggingface/transformers.git",
-    "optimum~=1.19",
+    # "optimum~=1.19",
+    "optimum @ git+https://github.com/huggingface/optimum.git@bump-transformers",
     "datasets>=1.4.0",
     "sentencepiece",
     "scipy",

From 0b1af86e9ee772da5a1acd709a0b319b349ecd35 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Fri, 17 May 2024 17:59:42 +0200
Subject: [PATCH 03/13] fix for dev transformers version

---
 optimum/exporters/openvino/model_patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 804adda04d..1699c6d362 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -454,7 +454,7 @@ def _llama_gemma_update_causal_mask_latest(
 
 
 # TODO : deprecate _llama_gemma_update_causal_mask_legacy when transformers>=4.41.0
-if is_transformers_version(">=", "4.41.0"):
+if is_transformers_version(">", "4.40.0"):
     _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_latest
 else:
     _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy

From 580bd64aff80759f415c9c1492593ac55343db3a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Fri, 17 May 2024 18:09:00 +0200
Subject: [PATCH 04/13] update setup

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index edb914ec61..9f82beab50 100644
--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,7 @@
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
     "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.10.0"],
-    "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
+    "ipex": ["intel-extension-for-pytorch"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,

From 56582136e6b14b176db7b9b558e39fc94337b343 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Fri, 17 May 2024 18:15:13 +0200
Subject: [PATCH 05/13] update setup

---
 setup.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 9f82beab50..0057c34c90 100644
--- a/setup.py
+++ b/setup.py
@@ -28,9 +28,8 @@
 
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers @ git+https://github.com/huggingface/transformers.git",
-    # "optimum~=1.19",
-    "optimum @ git+https://github.com/huggingface/optimum.git@bump-transformers",
+    "transformers>=4.36.0,<4.42.0",
+    "optimum~=1.19",
     "datasets>=1.4.0",
     "sentencepiece",
     "scipy",
@@ -63,7 +62,7 @@
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
     "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.10.0"],
-    "ipex": ["intel-extension-for-pytorch"],
+    "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,

From 89116017563737d0ef16e3e3037b14b70f801953 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Fri, 17 May 2024 18:59:13 +0200
Subject: [PATCH 06/13] fix opset

---
 optimum/exporters/openvino/model_configs.py | 16 ++++++++++++++++
 optimum/intel/openvino/trainer.py           |  8 ++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 575f1cc4db..5b6a83a6cd 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -28,6 +28,7 @@
     UNetOnnxConfig,
     VaeDecoderOnnxConfig,
     VaeEncoderOnnxConfig,
+    Wav2Vec2OnnxConfig,
 )
 from optimum.exporters.tasks import TasksManager
 from optimum.utils import DEFAULT_DUMMY_SHAPES
@@ -87,6 +88,21 @@ def init_model_configs():
 register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True)
 
 
+@register_in_tasks_manager(
+    "wav2vec2",
+    *[
+        "feature-extraction",
+        "automatic-speech-recognition",
+        "audio-classification",
+        "audio-frame-classification",
+        "audio-xvector",
+    ],
+    library_name="transformers",
+)
+class Wav2Vec2OpenVINOConfig(Wav2Vec2OnnxConfig):
+    DEFAULT_ONNX_OPSET = 14
+
+
 @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers")
 class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     DEFAULT_ONNX_OPSET = 13
diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index 0745a1cd79..b5fea9a663 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -906,17 +906,17 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
             output_path = os.path.join(output_dir, OV_XML_FILE_NAME)
             self.compression_controller.prepare_for_export()
             model_type = self.model.config.model_type.replace("_", "-")
-            onnx_config_class = TasksManager.get_exporter_config_constructor(
-                exporter="onnx",
+            exporter_config_class = TasksManager.get_exporter_config_constructor(
+                exporter="openvino",
                 model=self.model,
                 task=self.task,
                 model_type=model_type,
             )
 
             if self.task == "text-generation":
-                onnx_config = onnx_config_class(self.model.config, use_past=self.model.config.use_cache)
+                onnx_config = exporter_config_class(self.model.config, use_past=self.model.config.use_cache)
             else:
-                onnx_config = onnx_config_class(self.model.config)
+                onnx_config = exporter_config_class(self.model.config)
 
             num_parameters = self.model.num_parameters()
             save_as_external_data = use_external_data_format(num_parameters) or self.ov_config.save_onnx_model

From 8fb8cfee713cf3839d3f25fd213563883dd7a722 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Fri, 17 May 2024 19:06:45 +0200
Subject: [PATCH 07/13] fix quant op

---
 tests/openvino/test_training.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py
index c998d00d8b..5ce7f662ce 100644
--- a/tests/openvino/test_training.py
+++ b/tests/openvino/test_training.py
@@ -730,7 +730,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
     "quantization": OVTrainerTestDescriptor(
         model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2],
-        expected_fake_quantize=40,
+        expected_fake_quantize=24,
         expected_int8=30,
         compression_metrics=["compression_loss"],
     ),
@@ -757,7 +757,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
     "quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
         model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
-        expected_fake_quantize=40,
+        expected_fake_quantize=24,
         expected_int8=30,
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
@@ -775,7 +775,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
         model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         teacher_model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
-        expected_fake_quantize=40,
+        expected_fake_quantize=24,
         expected_int8=30,
         expected_binary_masks=48,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],

From 559380c15407196fd96898b659cf8d56e7d81a9a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Fri, 17 May 2024 23:31:39 +0200
Subject: [PATCH 08/13] fix

---
 tests/openvino/test_quantization.py | 2 +-
 tests/openvino/test_training.py     | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 98eb121d72..fa1349de2a 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -663,7 +663,7 @@ def preprocess_function(examples, tokenizer):
 
 
 class OVTrainerTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 49, 38),)
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 50, 38),)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
     def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8):
diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py
index 5ce7f662ce..c998d00d8b 100644
--- a/tests/openvino/test_training.py
+++ b/tests/openvino/test_training.py
@@ -730,7 +730,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
     "quantization": OVTrainerTestDescriptor(
         model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2],
-        expected_fake_quantize=24,
+        expected_fake_quantize=40,
         expected_int8=30,
         compression_metrics=["compression_loss"],
     ),
@@ -757,7 +757,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
     "quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
         model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
-        expected_fake_quantize=24,
+        expected_fake_quantize=40,
         expected_int8=30,
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
@@ -775,7 +775,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
         model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         teacher_model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
-        expected_fake_quantize=24,
+        expected_fake_quantize=40,
         expected_int8=30,
         expected_binary_masks=48,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],

From 76ae2db6b102c4d80c2878ec5a95dbb62ce7d76c Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Sat, 18 May 2024 00:01:34 +0200
Subject: [PATCH 09/13] fix

---
 optimum/exporters/openvino/model_patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 678cd39e3b..11698a380d 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -454,7 +454,7 @@ def _llama_gemma_update_causal_mask_latest(
 
 
 # TODO : deprecate _llama_gemma_update_causal_mask_legacy when transformers>=4.41.0
-if is_transformers_version(">", "4.40.0"):
+if is_transformers_version(">", "4.40.2"):
     _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_latest
 else:
     _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy

From c3c9b3cfb9e2cd3c9b04e0461ec2fa2d174004a4 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 21 May 2024 16:43:57 +0200
Subject: [PATCH 10/13] fix test

---
 setup.py                            | 2 +-
 tests/openvino/test_quantization.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 0057c34c90..0ccaa1f202 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
 INSTALL_REQUIRE = [
     "torch>=1.11",
     "transformers>=4.36.0,<4.42.0",
-    "optimum~=1.19",
+    "optimum @ git+https://github.com/huggingface/optimum.git@bump-transformers",
     "datasets>=1.4.0",
     "sentencepiece",
     "scipy",
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index fa1349de2a..98eb121d72 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -663,7 +663,7 @@ def preprocess_function(examples, tokenizer):
 
 
 class OVTrainerTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 50, 38),)
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 49, 38),)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
     def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8):

From 55052eb4db1b9e9446d0b84aa74ea2e6bd3de767 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 21 May 2024 18:59:23 +0200
Subject: [PATCH 11/13] remove wav2vec config

---
 optimum/exporters/openvino/model_configs.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 00269d1ba2..8feeafd619 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -28,7 +28,6 @@
     UNetOnnxConfig,
     VaeDecoderOnnxConfig,
     VaeEncoderOnnxConfig,
-    Wav2Vec2OnnxConfig,
 )
 from optimum.exporters.tasks import TasksManager
 from optimum.utils import DEFAULT_DUMMY_SHAPES
@@ -91,21 +90,6 @@ def init_model_configs():
 register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True)
 
 
-@register_in_tasks_manager(
-    "wav2vec2",
-    *[
-        "feature-extraction",
-        "automatic-speech-recognition",
-        "audio-classification",
-        "audio-frame-classification",
-        "audio-xvector",
-    ],
-    library_name="transformers",
-)
-class Wav2Vec2OpenVINOConfig(Wav2Vec2OnnxConfig):
-    DEFAULT_ONNX_OPSET = 14
-
-
 @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers")
 class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     DEFAULT_ONNX_OPSET = 13

From 63bf2ab231f15127778b6b041b09ba83b34c26e6 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 22 May 2024 16:07:24 +0200
Subject: [PATCH 12/13] fix setup

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 0ccaa1f202..251ec61cdd 100644
--- a/setup.py
+++ b/setup.py
@@ -28,8 +28,8 @@
 
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36.0,<4.42.0",
-    "optimum @ git+https://github.com/huggingface/optimum.git@bump-transformers",
+    "transformers>=4.36.0,<4.41.0",
+    "optimum~=1.19",
     "datasets>=1.4.0",
     "sentencepiece",
     "scipy",

From 2bd08f68b064dedf66f59d8df882fc2b43760150 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 22 May 2024 16:28:15 +0200
Subject: [PATCH 13/13] fix exporter

---
 optimum/intel/openvino/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index b5fea9a663..c8b29800fa 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -907,7 +907,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
             self.compression_controller.prepare_for_export()
             model_type = self.model.config.model_type.replace("_", "-")
             exporter_config_class = TasksManager.get_exporter_config_constructor(
-                exporter="openvino",
+                exporter="onnx",
                 model=self.model,
                 task=self.task,
                 model_type=model_type,