@@ -2467,3 +2467,134 @@ def patched_forward(*args, **kwargs):
             return outputs
 
         self.patched_forward = patched_forward
+
+
+def _decilm_attn_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    # DeciLM's attention implementation contains a bug when past key values are provided
+    def rotate_half(x):
+        """Rotates half the hidden dims of the input."""
+        x1 = x[..., : x.shape[-1] // 2]
+        x2 = x[..., x.shape[-1] // 2 :]
+        return torch.cat((-x2, x1), dim=-1)
+
+    def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+        """Applies Rotary Position Embedding to the query and key tensors.
+
+        Args:
+            q (`torch.Tensor`): The query tensor.
+            k (`torch.Tensor`): The key tensor.
+            cos (`torch.Tensor`): The cosine part of the rotary embedding.
+            sin (`torch.Tensor`): The sine part of the rotary embedding.
+            position_ids (`torch.Tensor`):
+                The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+                used to pass offsetted position ids when working with a KV-cache.
+            unsqueeze_dim (`int`, *optional*, defaults to 1):
+                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+        Returns:
+            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+        """
+        cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+        sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+        q_embed = (q * cos) + (rotate_half(q) * sin)
+        k_embed = (k * cos) + (rotate_half(k) * sin)
+        return q_embed, k_embed
+
+    def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+        """
+        This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+        num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+        """
+        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+        if n_rep == 1:
+            return hidden_states
+        hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+    bsz, q_len, _ = hidden_states.size()
+    if self.pretraining_tp > 1:
+        key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp
+        query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0)
+        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+        query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)]
+        query_states = torch.cat(query_states, dim=-1)
+
+        key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)]
+        key_states = torch.cat(key_states, dim=-1)
+
+        value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)]
+        value_states = torch.cat(value_states, dim=-1)
+
+    else:
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    past_key_value = (key_states, value_states) if use_cache else None
+
+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+    attn_output = F.scaled_dot_product_attention(
+        query_states, key_states, value_states, is_causal=attention_mask is None, attn_mask=attention_mask
+    )
+
+    # modified: the original implementation is missing the .transpose(1, 2) here
+    attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
+
+    if self.pretraining_tp > 1:
+        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
+        o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1)
+        attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)])
+    else:
+        attn_output = self.o_proj(attn_output)
+
+    attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
+class DeciLMModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+
+        for layer in self._model.model.layers:
+            layer.self_attn._orig_forward = layer.self_attn.forward
+            layer.self_attn.forward = types.MethodType(_decilm_attn_forward, layer.self_attn)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+
+        for layer in self._model.model.layers:
+            layer.self_attn.forward = layer.self_attn._orig_forward
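For reference, a small self-contained shape check (illustration only, not part of the patch) of why the added .transpose(1, 2) matters: F.scaled_dot_product_attention returns its output in the same (batch, heads, seq_len, head_dim) layout as the query, so the heads axis has to be moved next to head_dim before the .view(bsz, q_len, hidden_size) reshape, otherwise values from different heads get interleaved across sequence positions. The sizes below are arbitrary.

    import torch
    import torch.nn.functional as F

    bsz, n_heads, q_len, head_dim = 2, 4, 6, 8
    q = torch.randn(bsz, n_heads, q_len, head_dim)
    k = torch.randn(bsz, n_heads, q_len, head_dim)
    v = torch.randn(bsz, n_heads, q_len, head_dim)

    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    # SDPA keeps the query layout: (batch, heads, seq_len, head_dim)
    assert out.shape == (bsz, n_heads, q_len, head_dim)

    # bring heads next to head_dim first, then merge them into hidden_size
    hidden = out.transpose(1, 2).contiguous().view(bsz, q_len, n_heads * head_dim)
    assert hidden.shape == (bsz, q_len, n_heads * head_dim)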