From 6d0e3345e6b2d5e022d48c525180399a55c713c4 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 21 Mar 2024 09:35:40 +0400 Subject: [PATCH 1/5] refactor OVModelForCausalLM class --- optimum/intel/openvino/modeling_decoder.py | 85 ++++++++-------------- 1 file changed, 31 insertions(+), 54 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 10f0359a24..1622e4750f 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -120,6 +120,7 @@ def __init__( self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 self.next_beam_idx = None + self.past_len = 0 self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -356,19 +357,21 @@ def prepare_inputs( position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> Dict: - if self.use_cache and past_key_values is not None: - input_ids = input_ids[:, -1:] batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads inputs = {} - past_len = 0 if not self.stateful: if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: - past_len = past_key_values[0][1].shape[-2] + seq_len_dim = -2 + if self.config.model_type == "chatglm": + seq_len_dim = 0 + elif self.config.model_type == "qwen": + seq_len_dim = 1 + self.past_len = past_key_values[0][1].shape[seq_len_dim] if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -382,13 +385,14 @@ def prepare_inputs( past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer ) else: - past_len = past_key_values[0].shape[-2] + self.past_len = past_key_values[0].shape[-2] # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) # Create empty past_key_values for decoder_with_past first generation step elif self.use_cache: + self.past_len = 0 for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() @@ -411,6 +415,7 @@ def prepare_inputs( # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int) + self.past_len = 0 inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -419,7 +424,7 @@ def prepare_inputs( attention_mask = np.array(attention_mask) else: attention_mask = np.ones( - (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype + (input_ids.shape[0], input_ids.shape[1] + self.past_len), dtype=inputs["input_ids"].dtype ) if "attention_mask" in self.input_names: @@ -470,6 +475,7 @@ def forward( # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. 
past_key_values = ((),) + self.past_len += input_ids.shape[1] if not self.stateful: if self.use_cache: @@ -480,24 +486,38 @@ def forward( past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) ) + self.past_len += input_ids.shape[1] else: past_key_values = None + self.past_len = 0 return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation + # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) + if past_key_values is not None: + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - self.past_len) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif self.past_len < input_ids.shape[1]: + input_ids = input_ids[:, self.past_len:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: + if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) + position_ids = position_ids[:, -input_ids.shape[1]:] return { "input_ids": input_ids, @@ -573,10 +593,6 @@ def _from_pretrained( model_type = config.model_type.replace("_", "-") if model_type == "bloom": init_cls = OVBloomForCausalLM - elif model_type == "mpt": - init_cls = OVMPTForCausalLM - elif model_type == "opt": - init_cls = OVOPTForCausalLM elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM else: @@ -630,22 +646,13 @@ def _from_pretrained( class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - # only last token for input_ids if past is not None if past_key_values and not self.stateful: # the cache may be in the stardard format (e.g. 
in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } + + return super().prepare_inputs_for_generation(self, input_ids, past_key_values=past_key_values, **kwargs) # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache def _reorder_cache( @@ -712,36 +719,6 @@ def _convert_to_standard_cache( ) -class OVOPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - -class OVMPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache def _reorder_cache( From b69c3cdf6e7046a49df42b837f3aa77467aa31d4 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 21 Mar 2024 09:39:47 +0400 Subject: [PATCH 2/5] rework prepare_inputs_for_generation for OVModelForCausalLM --- optimum/intel/openvino/modeling_decoder.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 1622e4750f..53d719d718 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -357,7 +357,6 @@ def prepare_inputs( position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> Dict: - batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads @@ -509,7 +508,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. 
elif self.past_len < input_ids.shape[1]: - input_ids = input_ids[:, self.past_len:] + input_ids = input_ids[:, self.past_len :] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: @@ -517,7 +516,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] + position_ids = position_ids[:, -input_ids.shape[1] :] return { "input_ids": input_ids, @@ -651,8 +650,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) - - return super().prepare_inputs_for_generation(self, input_ids, past_key_values=past_key_values, **kwargs) + return super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, **kwargs) # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache def _reorder_cache( From f70a92670075f6dcce056de78261cd78e953194f Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 29 Mar 2024 11:01:23 +0400 Subject: [PATCH 3/5] refactoring --- optimum/intel/openvino/modeling_decoder.py | 48 +++++++++++++--------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 53d719d718..f403b76eca 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -120,7 +120,7 @@ def __init__( self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 self.next_beam_idx = None - self.past_len = 0 + self._past_length = 0 self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -365,12 +365,6 @@ def prepare_inputs( if not self.stateful: if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: - seq_len_dim = -2 - if self.config.model_type == "chatglm": - seq_len_dim = 0 - elif self.config.model_type == "qwen": - seq_len_dim = 1 - self.past_len = past_key_values[0][1].shape[seq_len_dim] if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -383,15 +377,13 @@ def prepare_inputs( past_key_values = tuple( past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer ) - else: - self.past_len = past_key_values[0].shape[-2] # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) # Create empty past_key_values for decoder_with_past first generation step elif self.use_cache: - self.past_len = 0 + past_len = 0 for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() @@ -414,7 +406,8 @@ def prepare_inputs( # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = 
np.arange(batch_size, dtype=int) - self.past_len = 0 + self._past_length = 0 + past_len = self._get_past_length(past_key_values) inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -423,7 +416,7 @@ def prepare_inputs( attention_mask = np.array(attention_mask) else: attention_mask = np.ones( - (input_ids.shape[0], input_ids.shape[1] + self.past_len), dtype=inputs["input_ids"].dtype + (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype ) if "attention_mask" in self.input_names: @@ -436,7 +429,7 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_key_values: - position_ids = np.expand_dims(position_ids[:, -1], axis=-1) + position_ids = np.expand_dims(position_ids[:, -input_ids.shape[1] :], axis=-1) inputs["position_ids"] = position_ids @@ -474,7 +467,7 @@ def forward( # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. past_key_values = ((),) - self.past_len += input_ids.shape[1] + self._past_length += input_ids.shape[1] if not self.stateful: if self.use_cache: @@ -485,10 +478,8 @@ def forward( past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) ) - self.past_len += input_ids.shape[1] else: past_key_values = None - self.past_len = 0 return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) @@ -499,16 +490,17 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg use_cache = kwargs.get("use_cache", None) if past_key_values is not None: + past_len = self._get_past_length(past_key_values) # Keep only the unprocessed tokens: # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as # input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - self.past_len) :] + input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. 
- elif self.past_len < input_ids.shape[1]: - input_ids = input_ids[:, self.past_len :] + elif past_len < input_ids.shape[1]: + input_ids = input_ids[:, past_len:] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: @@ -526,6 +518,24 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "attention_mask": attention_mask, } + def _get_past_length(self, past_key_values=None): + if past_key_values is None: + return 0 + if self.stateful: + return self._past_length + if self.config.model_type in MULTI_QUERY_ATTN_MODELS: + return past_key_values[0].shape[-2] + seq_length_dim = -2 + if self.config.model_type == "chatglm": + seq_length_dim = 0 + elif self.config.model_type == "qwen": + seq_length_dim = 1 + # input is tuple of pairs + if isinstance(past_key_values[0], (tuple, list)): + return past_key_values[0][1].shape[seq_length_dim] + # past key values comes after flattening + return past_key_values[1].shape[seq_length_dim] + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor From 9aa21b02c41790bb654e6e85c91fbea7b01de39b Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 29 Mar 2024 11:03:32 +0400 Subject: [PATCH 4/5] Apply suggestions from code review --- optimum/intel/openvino/modeling_decoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index f403b76eca..fbb55aea41 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -383,7 +383,6 @@ def prepare_inputs( # Create empty past_key_values for decoder_with_past first generation step elif self.use_cache: - past_len = 0 for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() From 217746129450247e2e627fbd2c96ae5bb22b900a Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 2 Apr 2024 17:29:18 +0400 Subject: [PATCH 5/5] fix position ids and add tests --- optimum/intel/openvino/modeling_decoder.py | 2 +- tests/openvino/test_modeling.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index fbb55aea41..4b156eda9e 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -428,7 +428,7 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_key_values: - position_ids = np.expand_dims(position_ids[:, -input_ids.shape[1] :], axis=-1) + position_ids = position_ids[:, -input_ids.shape[1] :] inputs["position_ids"] = position_ids diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 65094ae221..f54305113f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -632,6 +632,11 @@ def test_multiple_inputs(self, model_arch): outputs = model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) self.assertEqual(outputs.shape[0], 3) + # test that generation result is reproducible + outputs2 = model.generate(**tokens, generation_config=generation_config) + 
self.assertIsInstance(outputs2, torch.Tensor) + self.assertEqual(outputs2.shape[0], 3) + self.assertTrue(torch.allclose(outputs2, outputs)) del model gc.collect()
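Reviewer note (illustrative, not part of the patches): the core behavioral change of this series is that prepare_inputs_for_generation no longer hard-codes "keep only the last token" and instead trims input_ids against the tracked past length. Below is a minimal standalone sketch of that trimming rule and of the _get_past_length dispatch added in patch 3, written against plain numpy arrays. The DummyDecoder class, the contents of MULTI_QUERY_ATTN_MODELS and the example shapes are assumptions made for the sketch; the real method lives on OVModelForCausalLM and reads self.config.model_type, self.stateful and self._past_length.

    import numpy as np

    MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"}  # assumed contents for this sketch


    class DummyDecoder:
        """Stand-in holding only the attributes the real method reads."""

        def __init__(self, model_type, stateful=False):
            self.model_type = model_type
            self.stateful = stateful
            self._past_length = 0  # incremented by forward() in the stateful case

        def _get_past_length(self, past_key_values=None):
            # Mirrors the helper added in patch 3: pick the sequence-length axis per model family.
            if past_key_values is None:
                return 0
            if self.stateful:
                return self._past_length
            if self.model_type in MULTI_QUERY_ATTN_MODELS:
                return past_key_values[0].shape[-2]
            seq_length_dim = -2
            if self.model_type == "chatglm":
                seq_length_dim = 0
            elif self.model_type == "qwen":
                seq_length_dim = 1
            if isinstance(past_key_values[0], (tuple, list)):  # tuple of (key, value) pairs
                return past_key_values[0][1].shape[seq_length_dim]
            return past_key_values[1].shape[seq_length_dim]  # flattened key/value layout

        def trim_input_ids(self, input_ids, past_key_values, attention_mask):
            # Mirrors the "keep only the unprocessed tokens" rules from prepare_inputs_for_generation.
            if past_key_values is None:
                return input_ids
            past_len = self._get_past_length(past_key_values)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                return input_ids[:, -(attention_mask.shape[1] - past_len):]
            if past_len < input_ids.shape[1]:
                return input_ids[:, past_len:]
            return input_ids  # past covers everything: assume only unprocessed tokens were passed


    # llama-style cache: (key, value) pairs shaped (batch, heads, past_seq, head_dim)
    decoder = DummyDecoder("llama")
    pkv = ((np.zeros((1, 8, 5, 64)), np.zeros((1, 8, 5, 64))),)
    input_ids = np.arange(7).reshape(1, 7)            # 5 cached tokens + 2 new ones
    attention_mask = np.ones((1, 7), dtype=np.int64)
    print(decoder.trim_input_ids(input_ids, pkv, attention_mask))  # -> [[5 6]]

Unlike the previous behavior, more than one new token can now be fed in a single step, which is what the relaxed position_ids slicing (position_ids[:, -input_ids.shape[1]:]) in patches 1 and 5 accounts for.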
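A second illustrative snippet, mirroring the reproducibility check added to tests/openvino/test_modeling.py in patch 5. The model id, padding setup, input texts and generation parameters are assumptions chosen for the sketch, not values taken from the test file; it only assumes the public OVModelForCausalLM API (from_pretrained with export=True, generate).

    import torch
    from transformers import AutoTokenizer, GenerationConfig
    from optimum.intel import OVModelForCausalLM

    model_id = "hf-internal-testing/tiny-random-gpt2"  # assumed tiny model; any exportable causal LM works
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # decoder-only models are padded on the left for batched generation

    # export=True converts the PyTorch checkpoint to OpenVINO IR on the fly
    model = OVModelForCausalLM.from_pretrained(model_id, export=True)

    texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"]
    tokens = tokenizer(texts, padding=True, return_tensors="pt")
    generation_config = GenerationConfig(max_new_tokens=20, num_beams=2, do_sample=False)

    outputs = model.generate(**tokens, generation_config=generation_config)
    outputs2 = model.generate(**tokens, generation_config=generation_config)
    assert outputs.shape[0] == 3
    assert torch.equal(outputs, outputs2)  # generation is expected to be reproducible after this series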