import warnings
from pathlib import Path
from tempfile import TemporaryDirectory
-from typing import Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import openvino
from transformers import AutoModelForCausalLM, PretrainedConfig
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
from transformers.generation import GenerationMixin
+from transformers.generation.configuration_utils import GenerationConfig, GenerationMode
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.utils import GenerateOutput
from transformers.modeling_outputs import CausalLMOutputWithPast

from optimum.utils.normalized_config import NormalizedConfigManager
from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE


+if TYPE_CHECKING:
+    from transformers.modeling_utils import PreTrainedModel
+    from transformers.streamers import BaseStreamer
+
+
logger = logging.getLogger(__name__)

core = Core()
@@ -122,6 +131,8 @@ def __init__(
        self._pkv_precision = Type.f32
        self.next_beam_idx = None
        self._past_length = 0
+        self._first_iter_beam_search = False
+        self._second_iter_beam_search = False
        self.update_pkv_precision()
        if self.is_dynamic:
            self.model = self._reshape(self.model, -1, -1)
@@ -375,7 +386,11 @@ def prepare_inputs(
        inputs = {}
        if not self.stateful:
            if past_key_values is not None:
-                if self.config.model_type not in MULTI_QUERY_ATTN_MODELS:
+                if (
+                    self.config.model_type not in MULTI_QUERY_ATTN_MODELS
+                    or self.config.model_type == "falcon"
+                    and self.config.new_decoder_architecture
+                ):
                    if self._pkv_precision == Type.bf16:
                        # numpy does not support bf16, pretending f16, should change to bf16
                        past_key_values = tuple(
@@ -418,7 +433,6 @@ def prepare_inputs(
                    self.next_beam_idx = np.arange(batch_size, dtype=int)
                    self._past_length = 0
        past_len = self._get_past_length(past_key_values)
-
        inputs["input_ids"] = np.array(input_ids)
        # Add the attention_mask inputs when needed
        if "attention_mask" in self.input_names or "position_ids" in self.input_names:
@@ -468,6 +482,8 @@ def forward(
            **kwargs,
        )

+        if self._first_iter_beam_search:
+            inputs, duplication_indices = self._deduplicate_inputs(inputs)
        # Run inference
        self.request.start_async(inputs, share_inputs=True)
        self.request.wait()
@@ -483,14 +499,22 @@ def forward(
        if self.use_cache:
            # Tuple of length equal to: number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer)
            past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names)
-            if self.config.model_type not in MULTI_QUERY_ATTN_MODELS:
+            if (
+                self.config.model_type not in MULTI_QUERY_ATTN_MODELS
+                or self.config.model_type == "falcon"
+                and self.config.new_decoder_architecture
+            ):
                # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention)
                past_key_values = tuple(
                    past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv)
                )
        else:
            past_key_values = None

+        if self._first_iter_beam_search:
+            logits, past_key_values = self._expand_outputs_for_generation(duplication_indices, logits, past_key_values)
+            self._first_iter_beam_search = False
+
        return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values)

    # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
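The deduplication that forward() now performs on the first beam-search step relies on np.unique over the prompt rows: beam search replicates every prompt num_beams times, so inference only has to run once per unique prompt, and the logits are re-expanded afterwards with the reverse indices returned by np.unique. A minimal standalone numpy sketch of that round trip (not part of the diff; shapes and names are illustrative):

import numpy as np

# beam search feeds each prompt `num_beams` times (here 2 prompts, 3 beams each)
input_ids = np.array([[1, 2, 3]] * 3 + [[4, 5, 6]] * 3)

# collapse duplicate rows; `reverse` maps every original row back to its unique row
unique_ids, first_occurrence, reverse = np.unique(input_ids, axis=0, return_index=True, return_inverse=True)

# stand-in for the real inference call, executed on the 2 unique rows only
unique_logits = np.random.rand(unique_ids.shape[0], unique_ids.shape[1], 8)

# re-expand to the original batch of 6 rows, matching what beam search expects
expanded_logits = unique_logits[np.ravel(reverse)]
assert expanded_logits.shape[0] == input_ids.shape[0]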
@@ -520,20 +544,124 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

-        return {
+        model_inputs = {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }

+        return model_inputs
+
+    def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple):
+        batch_size = logits.shape[0]
+        if indicies.shape[0] != 1:
+            logits = logits[indicies]
+            if past_key_values and not self.stateful:
+                if (
+                    self.config.model_type not in MULTI_QUERY_ATTN_MODELS
+                    or self.config.model_type == "falcon"
+                    and self.config.new_decoder_architecture
+                ):
+                    past_key_values = tuple(
+                        tuple(
+                            past_state[indicies]
+                            if not self.config.model_type == "chatglm"
+                            else past_state[:, indicies, ...]
+                            for past_state in layer_past
+                        )
+                        for layer_past in past_key_values
+                    )
+                else:
+                    past_key_values = tuple([past_state[indicies] for past_state in past_key_values])
+            if self.stateful:
+                self.next_beam_idx = (
+                    self.next_beam_idx[indicies]
+                    if self.next_beam_idx is not None
+                    else np.arange(batch_size, dtype=int)[indicies]
+                )
+            self._second_iter_beam_search = True
+        return logits, past_key_values
+
+    def _deduplicate_inputs(self, model_inputs: Dict):
+        input_ids = model_inputs["input_ids"]
+        upd_model_inputs = {}
+        unique_input_ids, indicies, reverse_indicies = np.unique(
+            input_ids, axis=0, return_index=True, return_inverse=True
+        )
+        for input_name, input_tensor in model_inputs.items():
+            if input_name not in ["input_ids", "beam_idx"]:
+                if not isinstance(input_tensor, Tensor):
+                    upd_model_inputs[input_name] = input_tensor[indicies]
+                else:
+                    shape = input_tensor.shape
+                    dtype = input_tensor.element_type
+                    upd_batch_size = indicies.shape[0]
+                    if self.config.model_type == "bloom":
+                        upd_batch_size *= self.config.num_attention_heads
+                    shape[0 if not self.config.model_type == "chatglm" else 1] = upd_batch_size
+                    upd_model_inputs[input_name] = Tensor(dtype, shape)
+        upd_model_inputs["input_ids"] = unique_input_ids
+        if "beam_idx" in model_inputs:
+            beam_range = (
+                unique_input_ids.shape[0]
+                if self.config.model_type != "bloom"
+                else unique_input_ids.shape[0] * self.config.num_attention_heads
+            )
+            beam_idx = np.arange(beam_range, dtype=int)
+            upd_model_inputs["beam_idx"] = beam_idx
+        return upd_model_inputs, reverse_indicies
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+        synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        _generation_config, _ = self._prepare_generation_config(generation_config, **kwargs)
+        generation_mode = _generation_config.get_generation_mode(assistant_model)
+
+        is_beam_search = generation_mode in [
+            GenerationMode.BEAM_SEARCH,
+            GenerationMode.BEAM_SAMPLE,
+            GenerationMode.GROUP_BEAM_SEARCH,
+            GenerationMode.CONSTRAINED_BEAM_SEARCH,
+        ]
+        if is_beam_search:
+            self._first_iter_beam_search = True
+        result = super().generate(
+            inputs,
+            generation_config,
+            logits_processor,
+            stopping_criteria,
+            prefix_allowed_tokens_fn,
+            synced_gpus,
+            assistant_model,
+            streamer,
+            negative_prompt_ids,
+            negative_prompt_attention_mask,
+            **kwargs,
+        )
+        return result
+
    def _get_past_length(self, past_key_values=None):
        if past_key_values is None:
            return 0
        if self.stateful:
            return self._past_length
-        if self.config.model_type in MULTI_QUERY_ATTN_MODELS:
+        if self.config.model_type in MULTI_QUERY_ATTN_MODELS and not (
+            self.config.model_type == "falcon" and self.config.new_decoder_architecture
+        ):
            return past_key_values[0].shape[-2]
        seq_length_dim = -2
        if self.config.model_type == "chatglm":
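The generate() override above only sets _first_iter_beam_search when the resolved generation mode is one of the beam variants, so requesting num_beams > 1 is enough to exercise the new deduplication path. A hedged usage sketch (the checkpoint name and prompt are placeholders, assuming the optimum-intel OVModelForCausalLM export flow):

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_id = "gpt2"  # placeholder; any causal LM that can be exported to OpenVINO
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = OVModelForCausalLM.from_pretrained(model_id, export=True)

inputs = tokenizer("OpenVINO beam search test", return_tensors="pt")
# num_beams > 1 resolves to GenerationMode.BEAM_SEARCH, so the first forward call
# deduplicates the replicated prompt rows before running inference
outputs = model.generate(**inputs, num_beams=4, num_return_sequences=2, max_new_tokens=16)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))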
@@ -558,12 +686,20 @@ def _reorder_cache(
        if self.stateful:
            # TODO: Apply it differently based on model type
            # TODO: At least for bloom we need to replicate values for each attention head
-            self.next_beam_idx = np.array(beam_idx)  # save beam_idx to be used as an input in the next iteration
+            self.next_beam_idx = (
+                np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx
+            )  # save beam_idx to be used as an input in the next iteration
+            self._second_iter_beam_search = False
            return past_key_values
        else:
-            return tuple(
-                tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values
-            )
+            if self.config.model_type not in MULTI_QUERY_ATTN_MODELS and not (
+                self.config.model_type == "falcon" and self.config.new_decoder_architecture
+            ):
+                return tuple(
+                    tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past)
+                    for layer_past in past_key_values
+                )
+            return tuple(np.take(past_state, beam_idx, 0) for past_state in past_key_values)

    def can_generate(self):
        """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
@@ -684,11 +820,12 @@ def _reorder_cache(
        This is required to match `past_key_values` with the correct beam_idx at every generation step.
        """
        if self.stateful:
-            beam_idx = np.array(beam_idx)
            batch_size = beam_idx.shape[0]
+            beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx
            indices = np.array(range(batch_size * self.config.num_attention_heads))
            indices = indices.reshape([batch_size, self.config.num_attention_heads])
            self.next_beam_idx = np.take(indices, beam_idx, 0).flatten()
+            self._second_iter_beam_search = False
            return past_key_values
        else:
            standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx))
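The stateful bloom _reorder_cache in the hunk above replicates each selected beam index across attention heads before it becomes the next beam_idx input, since bloom's cache fuses the batch and head dimensions. A standalone numpy illustration of that reshape/take/flatten step (the sizes are made up):

import numpy as np

batch_size, num_heads = 2, 4     # illustrative sizes
beam_idx = np.array([1, 1])      # beams chosen by the search for this step

indices = np.arange(batch_size * num_heads).reshape([batch_size, num_heads])
next_beam_idx = np.take(indices, beam_idx, 0).flatten()
# next_beam_idx -> [4 5 6 7 4 5 6 7], i.e. the head rows of beam 1 repeated for both sequences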
@@ -738,14 +875,34 @@ def _convert_to_standard_cache(
                for layer_past in past_key_value
            )

+    def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple):
+        batch_size = logits.shape[0]
+        if indicies.shape[0] != 1:
+            logits = logits[indicies]
+            if past_key_values and not self.stateful:
+                pkv_standard = self._convert_to_standard_cache(past_key_values, batch_size)
+                pkv = tuple(tuple(past_state[indicies] for past_state in layer_past) for layer_past in pkv_standard)
+                past_key_values = self._convert_to_bloom_cache(pkv)
+
+            if self.stateful:
+                self.next_beam_idx = (
+                    self.next_beam_idx[indicies]
+                    if self.next_beam_idx is not None
+                    else np.arange(batch_size, dtype=int)[indicies]
+                )
+            self._second_iter_beam_search = True
+        return logits, past_key_values
+


class OVGPTBigCodeForCausalLM(OVModelForCausalLM):
    # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache
    def _reorder_cache(
        self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        if self.stateful:
-            self.next_beam_idx = np.array(beam_idx)  # save beam_idx to be used as an input in the next iteration
+            # save beam_idx to be used as an input in the next iteration
+            self.next_beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx
+            self._second_iter_beam_search = False
            return past_key_values
        else:
            return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values)