From b11becb7e8c757dd1cb860f34344976d0d34ac31 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 22 Mar 2024 10:30:18 +0400 Subject: [PATCH 1/5] support more models in export --- optimum/exporters/openvino/convert.py | 2 +- optimum/exporters/openvino/model_configs.py | 71 ++++++ optimum/exporters/openvino/model_patcher.py | 264 +++++++++++++++++++- 3 files changed, 334 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 98dd22d824..ccc046ce55 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -345,7 +345,7 @@ def ts_patched_forward(*args, **kwargs): input_dict = dict(zip(keys, tuple_input)) kwargs[input_name] = input_dict outputs = patched_forward(*args, **kwargs) - return tuple(outputs.values()) + return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs.values()]) patcher.patched_forward = ts_patched_forward diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a274b3671d..ddb6223951 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -35,6 +35,7 @@ ChatGLMModelPatcher, GemmaModelPatcher, MixtralModelPatcher, + OLMoModelPatcher, QwenModelPatcher, ) @@ -400,3 +401,73 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("olmo", *["text-generation", "text-generation-with-past"], library_name="transformers") +class OLMoOpenVINOConfig(TextDecoderOnnxConfig): + # OLMo does not require position_ids input. 
+ DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return OLMoModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") +class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.num_key_value_heads_per_layer = normalized_config.num_key_value_heads_per_layer + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_values = [] + + for layer_id in range(self.num_layers): + shape = ( + self.batch_size, + self.num_key_value_heads_per_layer[layer_id], + self.sequence_length, + self.hidden_size // self.num_attention_heads, + ) + past_key_values.append( + ( + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + ) + return past_key_values + + +@register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") +class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 2cedf64b0a..9a78a23e9d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -13,8 +13,9 @@ # limitations under the License. import logging as log +import math import types -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -513,5 +514,264 @@ def __init__( ): super().__init__(config, model, model_kwargs) # model has first inference buffers initialization - if self._model.lm_head.first_flag: + if hasattr(self._model.lm_head, "first_flag"): self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) + + +class OlmoOutput(NamedTuple): + logits: torch.FloatTensor + """ + A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities + for the next token *before* normalization via (log) softmax. 
+ """ + + attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] + """ + Attention keys and values from each block. + """ + + hidden_states: Optional[Tuple[torch.Tensor]] + """ + Hidden states from each block. + """ + + +def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False): + """ + Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf`` + is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``. + """ + if check_neg_inf: + x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min) + if check_pos_inf: + x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max) + + +def _olmo_model_forward( + self, + input_ids: torch.LongTensor, + input_embeddings: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + attention_bias: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]] = None, + use_cache: bool = False, + last_logits_only: bool = False, + output_hidden_states: Optional[bool] = None, +): + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + + if past_key_values: + assert len(past_key_values) == self.config.n_layers + + batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2] + if past_key_values is None: + past_length = 0 + else: + past_length = past_key_values[0][0].size(-2) + + # Get embeddings of input. + # shape: (batch_size, seq_len, d_model) + x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore + + if not (self.config.alibi or self.config.rope): + # Get positional embeddings. + # shape: (1, seq_len) + pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0) + # shape: (1, seq_len, d_model) + pos_emb = self.transformer.wpe(pos) # type: ignore + x = pos_emb + x + + # Add input + positional embeddings and apply dropout. + # shape: (batch_size, seq_len, d_model) + x = self.transformer.emb_drop(x) # type: ignore + + # Transform the attention mask into what the blocks expect. + if attention_mask is not None: + # shape: (batch_size, 1, 1, seq_len) + attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :] + attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min + + # Merge attention mask with attention bias. + if attention_bias is not None or attention_mask is not None or self.config.alibi or past_key_values is not None: + if attention_bias is None and self.config.alibi: + attention_bias = self.get_causal_attention_bias( + past_length + seq_len, x.device + ) + self.get_alibi_attention_bias(past_length + seq_len, x.device) + elif attention_bias is None: + attention_bias = self.get_causal_attention_bias(past_length + seq_len, x.device) + elif attention_bias.dtype in (torch.int8, torch.bool): + attention_bias = attention_bias.to(dtype=torch.float) + attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min) + + # Transform to the right shape and data type. + mask_len = seq_len + if attention_mask is not None: + mask_len = attention_mask.shape[-1] + elif past_key_values is not None: + mask_len = past_key_values[0][0].shape[-2] + seq_len + attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float) + + # Add in the masking bias. 
+ if attention_mask is not None: + attention_bias = attention_bias + attention_mask + # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf. + # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead + # it can produce NaNs. + ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False) + + attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None + + # decoder layers + all_hidden_states = [] + + # Apply blocks one-by-one. + if self.config.block_group_size == 1: + for block_idx, block in enumerate(self.transformer.blocks): + if output_hidden_states: + # add hidden states + all_hidden_states.append(x) + + layer_past = None if past_key_values is None else past_key_values[block_idx] + # shape: (batch_size, seq_len, d_model) + x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache) + if attn_key_values is not None: + assert cache is not None + attn_key_values.append(cache) + else: + for group_idx, block_group in enumerate(self.transformer.block_groups): + if output_hidden_states: + # add hidden states + all_hidden_states.append(x) + + layers_past = ( + None + if past_key_values is None + else past_key_values[ + group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size + ] + ) + x, cache = block_group(x, attention_bias=attention_bias, layers_past=layers_past, use_cache=use_cache) + if attn_key_values is not None: + assert cache is not None + attn_key_values.extend(cache) + + if last_logits_only: + # shape: (batch_size, 1, d_model) + x = x[:, -1, :].unsqueeze(1) + + # Apply final layer norm. + # shape: (batch_size, seq_len or 1, d_model) + x = self.transformer.ln_f(x) # type: ignore + if output_hidden_states: + # add final hidden state post-final-layernorm, following HuggingFace's convention + all_hidden_states.append(x) + + # Get logits. + # shape: (batch_size, seq_len or 1, vocab_size) + if self.config.weight_tying: + logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore + else: + logits = self.transformer.ff_out(x) # type: ignore + if self.config.scale_logits: + logits.mul_(1 / math.sqrt(self.config.d_model)) + + return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] + + +def _olmo_causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor: + att_bias = torch.triu( + torch.ones(seq_len, seq_len, device=device, dtype=torch.float), + diagonal=1, + ) + att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min) + return att_bias.view(1, 1, seq_len, seq_len) # type: ignore + + +def _olmo_get_causal_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: + if hasattr(self, "causal_bias") and self.causal_bias.shape[-1] >= seq_len: + return self.causal_bias.to(device) + with torch.autocast(device.type, enabled=False): + causal_bias = _olmo_causal_attention_bias(seq_len, device) + self.register_buffer("causal_bias", causal_bias) + return causal_bias + + +def _olmo_alibi_attention_bias(seq_len: int, config, device: torch.device) -> torch.FloatTensor: + alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, 1, seq_len) + """ + A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities + for the next token *before* normalization via (log) softmax. 
+ """ + # shape: (1, 1, seq_len, seq_len) + alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, seq_len, 1) + alibi_bias.abs_().mul_(-1) + + # shape: (n_heads,) + m = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device) + m.mul_(config.alibi_bias_max / config.n_heads) + + # shape: (1, n_heads, seq_len, seq_len) + return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore + + +def _olmo_get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: + alibi_bias = getattr(self, "alibi_attention_bias", None) + if alibi_bias is not None and alibi_bias.shape[-1] >= seq_len: + if alibi_bias.device != device: + alibi_bias = alibi_bias.to(device) + return alibi_bias + with torch.autocast(device.type, enabled=False): + alibi_bias = _olmo_alibi_attention_bias(seq_len, self.config, device) + self.register_buffer("alibi_attention_bias", alibi_bias) + return alibi_bias + + +def _olmo_get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]: + if ( + hasattr(self, "rope_pos_sin") + and hasattr(self, "rope_pos_cos") + and self.rope_pos_sin.shape[-2] >= seq_len + and self.rope_pos_cos.shape[-2] >= seq_len + ): + return self.rope_pos_sin.to(device)[:, :, :seq_len, :], self.rope_pos_sin.to(device)[:, :, :seq_len, :] + + with torch.autocast(device.type, enabled=False): + dim = self.config.d_model // self.config.n_heads + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim)) + seq = torch.arange(seq_len, device=device, dtype=torch.float) + freqs = torch.einsum("i , j -> i j", seq, inv_freq) + positions = torch.cat((freqs, freqs), dim=-1) + pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :] + + self.register_buffer("rope_pos_sin", pos_sin) + self.register_buffer("rope_pos_cos", pos_cos) + return pos_sin, pos_cos + + +class OLMoModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + # model uses custom cache buffers for storing rotary_embeddings and attention biases. 
+ # these objects are nontracable, replace them with standard torch tensors during export + self._model.model._orig_forward = self._model.model.forward + self._model.model._orig_get_alibi_attention_bias = self._model.model.get_alibi_attention_bias + self._model.model.forward = types.MethodType(_olmo_model_forward, self._model.model) + self._model.model.get_alibi_attention_bias = types.MethodType( + _olmo_get_alibi_attention_bias, self._model.model + ) + self._model.model.get_alibi_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) + self._model.model.get_causal_attention_bias = types.MethodType( + _olmo_get_causal_attention_bias, self._model.model + ) + self._model.model.get_causal_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) + for block in self._model.model.transformer.blocks: + block.rotary_emb._orig_get_rotary_embedding = block.rotary_emb.get_rotary_embedding + block.rotary_emb.get_rotary_embedding = types.MethodType(_olmo_get_rotary_embedding, block.rotary_emb) + block.rotary_emb.get_rotary_embedding(self._model.config.max_sequence_length, torch.device("cpu")) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.model.forward = self._model.model._orig_forward + self._model.model.get_alibi_attention_bias = self._model.model._orig_get_alibi_attention_bias + for block in self._model.model.transformer.blocks: + block.rotary_emb.get_rotary_embedding = block.rotary_emb._orig_get_rotary_embedding From 1e6f0b2833e6495dcbb491bdcd5a2d1a9ce870c3 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 25 Mar 2024 18:21:52 +0400 Subject: [PATCH 2/5] add orion --- optimum/exporters/openvino/__main__.py | 2 ++ optimum/exporters/openvino/model_configs.py | 11 ++++++++++- tests/openvino/test_modeling.py | 3 ++- tests/openvino/utils_tests.py | 2 ++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 5d6e31ebac..2f0bc1350f 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -203,6 +203,8 @@ def main_export( do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" model_type = config.model_type.replace("_", "-") + if model_type in {"falcon", "mpt"} and trust_remote_code: + trust_remote_code = False if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ddb6223951..d7b23d1238 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -75,7 +75,7 @@ def init_model_configs(): @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") -class BaichaunOpenVINOConfig(TextDecoderOnnxConfig): +class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" @@ -471,3 +471,12 @@ class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = 
NormalizedTextConfig + + +@register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") +class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f54305113f..70c6ceab5d 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -524,10 +524,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "stablelm", "starcoder2", "phi", + "internlm2", ) GENERATION_LENGTH = 100 IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") - REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "decilm") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c95444274e..bb11ad7baa 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -36,6 +36,7 @@ "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", "deberta": "hf-internal-testing/tiny-random-deberta", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", + "decilm": "katuni4ka/tiny-random-deciml", "deit": "hf-internal-testing/tiny-random-deit", "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", @@ -50,6 +51,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", + "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", "llama": "fxmarty/tiny-llama-fast-tokenizer", From 22141ef980d7f46163000f3d1898e82e02783f06 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 5 Apr 2024 14:38:34 +0400 Subject: [PATCH 3/5] update tests --- optimum/exporters/openvino/__main__.py | 17 +- optimum/exporters/openvino/model_configs.py | 62 ----- optimum/exporters/openvino/model_patcher.py | 262 +------------------- tests/openvino/test_modeling.py | 3 +- tests/openvino/utils_tests.py | 3 +- 5 files changed, 19 insertions(+), 328 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 2f0bc1350f..856ea6798f 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -202,9 +202,6 @@ def main_export( quantization_config = getattr(config, "quantization_config", None) do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" model_type = config.model_type.replace("_", "-") - - if model_type in {"falcon", "mpt"} and trust_remote_code: - trust_remote_code = False if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( @@ -222,6 +219,20 @@ def main_export( ) if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: loading_kwargs["attn_implementation"] = "eager" + # there are some difference between remote and in library representation of past key values for some 
models, + # for avoiding confusion we disable remote code for them + if ( + trust_remote_code + and model_type in {"falcon", "mpt", "phi"} + and ("with-past" in task or original_task == "auto") + and not custom_export_configs + ): + logger.warning( + "Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" + "using default export configuration, `trust_remote_code` will be disabled. " + "Please provide custon export config if you want load model with remote code." + ) + trust_remote_code = False # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index d7b23d1238..6f22cf2142 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -35,7 +35,6 @@ ChatGLMModelPatcher, GemmaModelPatcher, MixtralModelPatcher, - OLMoModelPatcher, QwenModelPatcher, ) @@ -403,18 +402,6 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig -@register_in_tasks_manager("olmo", *["text-generation", "text-generation-with-past"], library_name="transformers") -class OLMoOpenVINOConfig(TextDecoderOnnxConfig): - # OLMo does not require position_ids input. - DEFAULT_ONNX_OPSET = 13 - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig - - def patch_model_for_export( - self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None - ) -> "ModelPatcher": - return OLMoModelPatcher(self, model, model_kwargs=model_kwargs) - - @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 @@ -424,55 +411,6 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig -class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): - def __init__( - self, - task: str, - normalized_config: NormalizedTextConfig, - batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], - sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], - random_batch_size_range: Optional[Tuple[int, int]] = None, - random_sequence_length_range: Optional[Tuple[int, int]] = None, - **kwargs, - ): - super().__init__( - task=task, - normalized_config=normalized_config, - batch_size=batch_size, - sequence_length=sequence_length, - random_batch_size_range=random_batch_size_range, - random_sequence_length_range=random_sequence_length_range, - ) - self.num_key_value_heads_per_layer = normalized_config.num_key_value_heads_per_layer - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - past_key_values = [] - - for layer_id in range(self.num_layers): - shape = ( - self.batch_size, - self.num_key_value_heads_per_layer[layer_id], - self.sequence_length, - self.hidden_size // self.num_attention_heads, - ) - past_key_values.append( - ( - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - ) - ) - return past_key_values - - -@register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") -class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 14 - - DUMMY_INPUT_GENERATOR_CLASSES = 
(DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) - DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig - - @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 9a78a23e9d..bafd467dd4 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -13,9 +13,8 @@ # limitations under the License. import logging as log -import math import types -from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -516,262 +515,3 @@ def __init__( # model has first inference buffers initialization if hasattr(self._model.lm_head, "first_flag"): self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) - - -class OlmoOutput(NamedTuple): - logits: torch.FloatTensor - """ - A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities - for the next token *before* normalization via (log) softmax. - """ - - attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] - """ - Attention keys and values from each block. - """ - - hidden_states: Optional[Tuple[torch.Tensor]] - """ - Hidden states from each block. - """ - - -def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False): - """ - Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf`` - is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``. - """ - if check_neg_inf: - x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min) - if check_pos_inf: - x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max) - - -def _olmo_model_forward( - self, - input_ids: torch.LongTensor, - input_embeddings: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - attention_bias: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]] = None, - use_cache: bool = False, - last_logits_only: bool = False, - output_hidden_states: Optional[bool] = None, -): - output_hidden_states = output_hidden_states if output_hidden_states is not None else False - - if past_key_values: - assert len(past_key_values) == self.config.n_layers - - batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2] - if past_key_values is None: - past_length = 0 - else: - past_length = past_key_values[0][0].size(-2) - - # Get embeddings of input. - # shape: (batch_size, seq_len, d_model) - x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore - - if not (self.config.alibi or self.config.rope): - # Get positional embeddings. - # shape: (1, seq_len) - pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0) - # shape: (1, seq_len, d_model) - pos_emb = self.transformer.wpe(pos) # type: ignore - x = pos_emb + x - - # Add input + positional embeddings and apply dropout. 
- # shape: (batch_size, seq_len, d_model) - x = self.transformer.emb_drop(x) # type: ignore - - # Transform the attention mask into what the blocks expect. - if attention_mask is not None: - # shape: (batch_size, 1, 1, seq_len) - attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :] - attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min - - # Merge attention mask with attention bias. - if attention_bias is not None or attention_mask is not None or self.config.alibi or past_key_values is not None: - if attention_bias is None and self.config.alibi: - attention_bias = self.get_causal_attention_bias( - past_length + seq_len, x.device - ) + self.get_alibi_attention_bias(past_length + seq_len, x.device) - elif attention_bias is None: - attention_bias = self.get_causal_attention_bias(past_length + seq_len, x.device) - elif attention_bias.dtype in (torch.int8, torch.bool): - attention_bias = attention_bias.to(dtype=torch.float) - attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min) - - # Transform to the right shape and data type. - mask_len = seq_len - if attention_mask is not None: - mask_len = attention_mask.shape[-1] - elif past_key_values is not None: - mask_len = past_key_values[0][0].shape[-2] + seq_len - attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float) - - # Add in the masking bias. - if attention_mask is not None: - attention_bias = attention_bias + attention_mask - # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf. - # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead - # it can produce NaNs. - ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False) - - attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None - - # decoder layers - all_hidden_states = [] - - # Apply blocks one-by-one. - if self.config.block_group_size == 1: - for block_idx, block in enumerate(self.transformer.blocks): - if output_hidden_states: - # add hidden states - all_hidden_states.append(x) - - layer_past = None if past_key_values is None else past_key_values[block_idx] - # shape: (batch_size, seq_len, d_model) - x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache) - if attn_key_values is not None: - assert cache is not None - attn_key_values.append(cache) - else: - for group_idx, block_group in enumerate(self.transformer.block_groups): - if output_hidden_states: - # add hidden states - all_hidden_states.append(x) - - layers_past = ( - None - if past_key_values is None - else past_key_values[ - group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size - ] - ) - x, cache = block_group(x, attention_bias=attention_bias, layers_past=layers_past, use_cache=use_cache) - if attn_key_values is not None: - assert cache is not None - attn_key_values.extend(cache) - - if last_logits_only: - # shape: (batch_size, 1, d_model) - x = x[:, -1, :].unsqueeze(1) - - # Apply final layer norm. - # shape: (batch_size, seq_len or 1, d_model) - x = self.transformer.ln_f(x) # type: ignore - if output_hidden_states: - # add final hidden state post-final-layernorm, following HuggingFace's convention - all_hidden_states.append(x) - - # Get logits. 
- # shape: (batch_size, seq_len or 1, vocab_size) - if self.config.weight_tying: - logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore - else: - logits = self.transformer.ff_out(x) # type: ignore - if self.config.scale_logits: - logits.mul_(1 / math.sqrt(self.config.d_model)) - - return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type] - - -def _olmo_causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor: - att_bias = torch.triu( - torch.ones(seq_len, seq_len, device=device, dtype=torch.float), - diagonal=1, - ) - att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min) - return att_bias.view(1, 1, seq_len, seq_len) # type: ignore - - -def _olmo_get_causal_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: - if hasattr(self, "causal_bias") and self.causal_bias.shape[-1] >= seq_len: - return self.causal_bias.to(device) - with torch.autocast(device.type, enabled=False): - causal_bias = _olmo_causal_attention_bias(seq_len, device) - self.register_buffer("causal_bias", causal_bias) - return causal_bias - - -def _olmo_alibi_attention_bias(seq_len: int, config, device: torch.device) -> torch.FloatTensor: - alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, 1, seq_len) - """ - A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities - for the next token *before* normalization via (log) softmax. - """ - # shape: (1, 1, seq_len, seq_len) - alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, seq_len, 1) - alibi_bias.abs_().mul_(-1) - - # shape: (n_heads,) - m = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device) - m.mul_(config.alibi_bias_max / config.n_heads) - - # shape: (1, n_heads, seq_len, seq_len) - return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore - - -def _olmo_get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor: - alibi_bias = getattr(self, "alibi_attention_bias", None) - if alibi_bias is not None and alibi_bias.shape[-1] >= seq_len: - if alibi_bias.device != device: - alibi_bias = alibi_bias.to(device) - return alibi_bias - with torch.autocast(device.type, enabled=False): - alibi_bias = _olmo_alibi_attention_bias(seq_len, self.config, device) - self.register_buffer("alibi_attention_bias", alibi_bias) - return alibi_bias - - -def _olmo_get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]: - if ( - hasattr(self, "rope_pos_sin") - and hasattr(self, "rope_pos_cos") - and self.rope_pos_sin.shape[-2] >= seq_len - and self.rope_pos_cos.shape[-2] >= seq_len - ): - return self.rope_pos_sin.to(device)[:, :, :seq_len, :], self.rope_pos_sin.to(device)[:, :, :seq_len, :] - - with torch.autocast(device.type, enabled=False): - dim = self.config.d_model // self.config.n_heads - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim)) - seq = torch.arange(seq_len, device=device, dtype=torch.float) - freqs = torch.einsum("i , j -> i j", seq, inv_freq) - positions = torch.cat((freqs, freqs), dim=-1) - pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :] - - self.register_buffer("rope_pos_sin", pos_sin) - self.register_buffer("rope_pos_cos", pos_cos) - return pos_sin, pos_cos - - -class 
OLMoModelPatcher(DecoderModelPatcher): - def __enter__(self): - super().__enter__() - # model uses custom cache buffers for storing rotary_embeddings and attention biases. - # these objects are nontracable, replace them with standard torch tensors during export - self._model.model._orig_forward = self._model.model.forward - self._model.model._orig_get_alibi_attention_bias = self._model.model.get_alibi_attention_bias - self._model.model.forward = types.MethodType(_olmo_model_forward, self._model.model) - self._model.model.get_alibi_attention_bias = types.MethodType( - _olmo_get_alibi_attention_bias, self._model.model - ) - self._model.model.get_alibi_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) - self._model.model.get_causal_attention_bias = types.MethodType( - _olmo_get_causal_attention_bias, self._model.model - ) - self._model.model.get_causal_attention_bias(self._model.config.max_sequence_length, torch.device("cpu")) - for block in self._model.model.transformer.blocks: - block.rotary_emb._orig_get_rotary_embedding = block.rotary_emb.get_rotary_embedding - block.rotary_emb.get_rotary_embedding = types.MethodType(_olmo_get_rotary_embedding, block.rotary_emb) - block.rotary_emb.get_rotary_embedding(self._model.config.max_sequence_length, torch.device("cpu")) - - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - self._model.model.forward = self._model.model._orig_forward - self._model.model.get_alibi_attention_bias = self._model.model._orig_get_alibi_attention_bias - for block in self._model.model.transformer.blocks: - block.rotary_emb.get_rotary_embedding = block.rotary_emb._orig_get_rotary_embedding diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 70c6ceab5d..32fc255a1f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -525,10 +525,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "starcoder2", "phi", "internlm2", + "orion", ) GENERATION_LENGTH = 100 IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") - REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "decilm") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "olmo", "orion") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index bb11ad7baa..e7f62f1f61 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -36,7 +36,6 @@ "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", "deberta": "hf-internal-testing/tiny-random-deberta", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", - "decilm": "katuni4ka/tiny-random-deciml", "deit": "hf-internal-testing/tiny-random-deit", "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", @@ -71,6 +70,8 @@ "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", "mt5": "stas/mt5-tiny-random", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "olmo": "katuni4ka/tiny-random-olmo", + "orion": "katuni4ka/tiny-random-orion", "pegasus": "hf-internal-testing/tiny-random-pegasus", "pix2struct": "fxmarty/pix2struct-tiny-random", "phi": "echarlaix/tiny-random-PhiForCausalLM", From 2e5a791eb7efd5ba3fc30d10691fefedae2a41f3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix 
<80481427+echarlaix@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:20:57 +0200 Subject: [PATCH 4/5] Update optimum/exporters/openvino/__main__.py --- optimum/exporters/openvino/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 856ea6798f..bf45ab7e75 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -230,7 +230,7 @@ def main_export( logger.warning( "Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" "using default export configuration, `trust_remote_code` will be disabled. " - "Please provide custon export config if you want load model with remote code." + "Please provide custom export config if you want load model with remote code." ) trust_remote_code = False From be1a0d14cc160de1ddec0021f81ed72604acf721 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:21:02 +0200 Subject: [PATCH 5/5] Update optimum/exporters/openvino/__main__.py --- optimum/exporters/openvino/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index bf45ab7e75..dbea798f75 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -228,7 +228,7 @@ def main_export( and not custom_export_configs ): logger.warning( - "Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" + f"Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" "using default export configuration, `trust_remote_code` will be disabled. " "Please provide custom export config if you want load model with remote code." )
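
For reference, a minimal usage sketch of what this patch series enables (not part of the diff itself): once the new export configs are registered, one of the added architectures can be exported and run through the OpenVINO model classes. The checkpoint id below is the tiny test model referenced in tests/openvino/utils_tests.py; any InternLM2 or Orion checkpoint should follow the same pattern, and since both are listed in REMOTE_CODE_MODELS, trust_remote_code=True is needed at load time.

    # Sketch only, mirroring the flow of test_compare_to_transformers: it exercises
    # the newly registered "internlm2" export config. The checkpoint id is the test
    # fixture from utils_tests.py; substitute any compatible checkpoint.
    from transformers import AutoTokenizer
    from optimum.intel import OVModelForCausalLM

    model_id = "katuni4ka/tiny-random-internlm2"
    # export=True converts the PyTorch checkpoint to OpenVINO IR on the fly,
    # routing through the export configs added in this series.
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    inputs = tokenizer("Hello, my name is", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=10)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The same conversion can presumably also be driven from the command line with `optimum-cli export openvino --model <model_id> --trust-remote-code <output_dir>`, which goes through the main_export function touched in patches 3-5 (including the new logic that disables trust_remote_code for falcon/mpt/phi when the default with-past export config is used).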