diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py
index e6ec1879a5..c90c2211ed 100644
--- a/optimum/exporters/openvino/stateful.py
+++ b/optimum/exporters/openvino/stateful.py
@@ -22,7 +22,6 @@ from openvino.runtime import opset13
 
 from optimum.exporters import TasksManager
 from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version
-from optimum.utils.normalized_config import NormalizedConfigManager
 
 
 def model_has_state(ov_model: ov.Model):
@@ -217,9 +216,7 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model):
     batch_dim = 1 if config.model_type == "chatglm" else 0
 
     fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
-
-    normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
-    num_attention_heads = normalized_config.num_attention_heads if config.model_type == "bloom" else 1
+    num_attention_heads = config.num_attention_heads if config.model_type == "bloom" else 1
     make_stateful(
         ov_model, not_kv_inputs, key_value_input_names, key_value_output_names, batch_dim, num_attention_heads, None
     )
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 1644022c29..64135266b3 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -27,7 +27,7 @@
 from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from optimum.utils import NormalizedConfigManager
+from optimum.utils.normalized_config import NormalizedConfigManager
 
 from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful
 from ...exporters.openvino.stateful import model_has_state
@@ -132,7 +132,6 @@ def __init__(
         self.stateful = model_has_sinks
         self.main_input_name = "input_ids"
         self.num_pkv = 2
-        self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
         self.key_value_input_names = [key for key in self.input_names if "key_values" in key]
         self.key_value_output_names = [key for key in self.output_names if "present" in key]
         self._original_model = self.model.clone()  # keep original model for serialization
@@ -321,6 +320,13 @@ def reshape(self, batch_size: int, sequence_length: int):
         logger.warning("Static shapes are not supported for causal language model.")
         return self
 
+    @property
+    def normalized_config(self):
+        logger.warning(
+            "access to normalized_config attribute is deprecated and will be removed in future versions, please use config"
+        )
+        return NormalizedConfigManager.get_normalized_config_class(self.config.model_type)(self.config)
+
     def compile(self):
         if self.request is None:
             super().compile()
@@ -364,7 +370,7 @@ def forward(
 
         batch_size = input_ids.shape[0]
         if self.config.model_type == "bloom":
-            batch_size *= self.normalized_config.num_attention_heads
+            batch_size *= self.config.num_attention_heads
 
         inputs = {}
         past_len = 0
@@ -592,8 +598,8 @@ def _reorder_cache(
         if self.stateful:
             beam_idx = np.array(beam_idx)
             batch_size = beam_idx.shape[0]
-            indices = np.array(range(batch_size * self.normalized_config.num_attention_heads))
-            indices = indices.reshape([batch_size, self.normalized_config.num_attention_heads])
+            indices = np.array(range(batch_size * self.config.num_attention_heads))
+            indices = indices.reshape([batch_size, self.config.num_attention_heads])
             self.next_beam_idx = np.take(indices, beam_idx, 0).flatten()
             return past_key_values
         else:
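
For context, a minimal standalone sketch of the deprecation shim this patch introduces, not part of the patch itself: the _FakeModel class below is a hypothetical stand-in for the decoder model class in modeling_decoder.py, while the property body is copied from the hunk above. It also shows why patch_stateful can read config.num_attention_heads directly for bloom: BloomConfig maps num_attention_heads onto its native n_head attribute. Making normalized_config a warn-on-access property that builds the NormalizedConfig on the fly means __init__ no longer needs NormalizedConfigManager at all, while old callers keep working during the deprecation window.

import logging

from transformers import BloomConfig
from optimum.utils.normalized_config import NormalizedConfigManager

logger = logging.getLogger(__name__)


class _FakeModel:
    """Hypothetical stand-in for the OpenVINO decoder model class."""

    def __init__(self, config):
        self.config = config

    @property
    def normalized_config(self):
        # Same pattern as the property added in the patch: warn on access,
        # then build the normalized config lazily instead of caching it in __init__.
        logger.warning(
            "access to normalized_config attribute is deprecated and will be removed in future versions, please use config"
        )
        return NormalizedConfigManager.get_normalized_config_class(self.config.model_type)(self.config)


model = _FakeModel(BloomConfig(n_head=16))

# The deprecated read (emits the warning) and its replacement return the same value:
assert model.normalized_config.num_attention_heads == 16  # deprecated path
assert model.config.num_attention_heads == 16  # preferred path, as used in the patch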