From 6d0e3345e6b2d5e022d48c525180399a55c713c4 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 21 Mar 2024 09:35:40 +0400 Subject: [PATCH 1/5] refactor OVModelForCausalLM class --- optimum/intel/openvino/modeling_decoder.py | 85 ++++++++-------------- 1 file changed, 31 insertions(+), 54 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 10f0359a24..1622e4750f 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -120,6 +120,7 @@ def __init__( self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 self.next_beam_idx = None + self.past_len = 0 self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -356,19 +357,21 @@ def prepare_inputs( position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> Dict: - if self.use_cache and past_key_values is not None: - input_ids = input_ids[:, -1:] batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads inputs = {} - past_len = 0 if not self.stateful: if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: - past_len = past_key_values[0][1].shape[-2] + seq_len_dim = -2 + if self.config.model_type == "chatglm": + seq_len_dim = 0 + elif self.config.model_type == "qwen": + seq_len_dim = 1 + self.past_len = past_key_values[0][1].shape[seq_len_dim] if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -382,13 +385,14 @@ def prepare_inputs( past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer ) else: - past_len = past_key_values[0].shape[-2] + self.past_len = past_key_values[0].shape[-2] # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) # Create empty past_key_values for decoder_with_past first generation step elif self.use_cache: + self.past_len = 0 for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() @@ -411,6 +415,7 @@ def prepare_inputs( # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int) + self.past_len = 0 inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -419,7 +424,7 @@ def prepare_inputs( attention_mask = np.array(attention_mask) else: attention_mask = np.ones( - (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype + (input_ids.shape[0], input_ids.shape[1] + self.past_len), dtype=inputs["input_ids"].dtype ) if "attention_mask" in self.input_names: @@ -470,6 +475,7 @@ def forward( # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. 
past_key_values = ((),) + self.past_len += input_ids.shape[1] if not self.stateful: if self.use_cache: @@ -480,24 +486,38 @@ def forward( past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) ) + self.past_len += input_ids.shape[1] else: past_key_values = None + self.past_len = 0 return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation + # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) + if past_key_values is not None: + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - self.past_len) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif self.past_len < input_ids.shape[1]: + input_ids = input_ids[:, self.past_len:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: + if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) + position_ids = position_ids[:, -input_ids.shape[1]:] return { "input_ids": input_ids, @@ -573,10 +593,6 @@ def _from_pretrained( model_type = config.model_type.replace("_", "-") if model_type == "bloom": init_cls = OVBloomForCausalLM - elif model_type == "mpt": - init_cls = OVMPTForCausalLM - elif model_type == "opt": - init_cls = OVOPTForCausalLM elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM else: @@ -630,22 +646,13 @@ def _from_pretrained( class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - # only last token for input_ids if past is not None if past_key_values and not self.stateful: # the cache may be in the stardard format (e.g. 
in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } + + return super().prepare_inputs_for_generation(self, input_ids, past_key_values=past_key_values, **kwargs) # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache def _reorder_cache( @@ -712,36 +719,6 @@ def _convert_to_standard_cache( ) -class OVOPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - -class OVMPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache def _reorder_cache( From b69c3cdf6e7046a49df42b837f3aa77467aa31d4 Mon Sep 17 00:00:00 2001 From: eaidova Date: Thu, 21 Mar 2024 09:39:47 +0400 Subject: [PATCH 2/5] rework prepare_inputs_for_generation for OVModelForCausalLM --- optimum/intel/openvino/modeling_decoder.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 1622e4750f..53d719d718 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -357,7 +357,6 @@ def prepare_inputs( position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> Dict: - batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads @@ -509,7 +508,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. 
elif self.past_len < input_ids.shape[1]: - input_ids = input_ids[:, self.past_len:] + input_ids = input_ids[:, self.past_len :] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: @@ -517,7 +516,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] + position_ids = position_ids[:, -input_ids.shape[1] :] return { "input_ids": input_ids, @@ -651,8 +650,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) - - return super().prepare_inputs_for_generation(self, input_ids, past_key_values=past_key_values, **kwargs) + return super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, **kwargs) # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache def _reorder_cache( From f70a92670075f6dcce056de78261cd78e953194f Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 29 Mar 2024 11:01:23 +0400 Subject: [PATCH 3/5] refactoring --- optimum/intel/openvino/modeling_decoder.py | 48 +++++++++++++--------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 53d719d718..f403b76eca 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -120,7 +120,7 @@ def __init__( self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 self.next_beam_idx = None - self.past_len = 0 + self._past_length = 0 self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -365,12 +365,6 @@ def prepare_inputs( if not self.stateful: if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: - seq_len_dim = -2 - if self.config.model_type == "chatglm": - seq_len_dim = 0 - elif self.config.model_type == "qwen": - seq_len_dim = 1 - self.past_len = past_key_values[0][1].shape[seq_len_dim] if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -383,15 +377,13 @@ def prepare_inputs( past_key_values = tuple( past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer ) - else: - self.past_len = past_key_values[0].shape[-2] # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) # Create empty past_key_values for decoder_with_past first generation step elif self.use_cache: - self.past_len = 0 + past_len = 0 for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() @@ -414,7 +406,8 @@ def prepare_inputs( # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = 
np.arange(batch_size, dtype=int) - self.past_len = 0 + self._past_length = 0 + past_len = self._get_past_length(past_key_values) inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -423,7 +416,7 @@ def prepare_inputs( attention_mask = np.array(attention_mask) else: attention_mask = np.ones( - (input_ids.shape[0], input_ids.shape[1] + self.past_len), dtype=inputs["input_ids"].dtype + (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype ) if "attention_mask" in self.input_names: @@ -436,7 +429,7 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_key_values: - position_ids = np.expand_dims(position_ids[:, -1], axis=-1) + position_ids = np.expand_dims(position_ids[:, -input_ids.shape[1] :], axis=-1) inputs["position_ids"] = position_ids @@ -474,7 +467,7 @@ def forward( # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. past_key_values = ((),) - self.past_len += input_ids.shape[1] + self._past_length += input_ids.shape[1] if not self.stateful: if self.use_cache: @@ -485,10 +478,8 @@ def forward( past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) ) - self.past_len += input_ids.shape[1] else: past_key_values = None - self.past_len = 0 return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) @@ -499,16 +490,17 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg use_cache = kwargs.get("use_cache", None) if past_key_values is not None: + past_len = self._get_past_length(past_key_values) # Keep only the unprocessed tokens: # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as # input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - self.past_len) :] + input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. 
- elif self.past_len < input_ids.shape[1]: - input_ids = input_ids[:, self.past_len :] + elif past_len < input_ids.shape[1]: + input_ids = input_ids[:, past_len:] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: @@ -526,6 +518,24 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "attention_mask": attention_mask, } + def _get_past_length(self, past_key_values=None): + if past_key_values is None: + return 0 + if self.stateful: + return self._past_length + if self.config.model_type in MULTI_QUERY_ATTN_MODELS: + return past_key_values[0].shape[-2] + seq_length_dim = -2 + if self.config.model_type == "chatglm": + seq_length_dim = 0 + elif self.config.model_type == "qwen": + seq_length_dim = 1 + # input is tuple of pairs + if isinstance(past_key_values[0], (tuple, list)): + return past_key_values[0][1].shape[seq_length_dim] + # past key values comes after flattening + return past_key_values[1].shape[seq_length_dim] + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor From 9aa21b02c41790bb654e6e85c91fbea7b01de39b Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 29 Mar 2024 11:03:32 +0400 Subject: [PATCH 4/5] Apply suggestions from code review --- optimum/intel/openvino/modeling_decoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index f403b76eca..fbb55aea41 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -383,7 +383,6 @@ def prepare_inputs( # Create empty past_key_values for decoder_with_past first generation step elif self.use_cache: - past_len = 0 for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() From 217746129450247e2e627fbd2c96ae5bb22b900a Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 2 Apr 2024 17:29:18 +0400 Subject: [PATCH 5/5] fix position ids and add tests --- optimum/intel/openvino/modeling_decoder.py | 2 +- tests/openvino/test_modeling.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index fbb55aea41..4b156eda9e 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -428,7 +428,7 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_key_values: - position_ids = np.expand_dims(position_ids[:, -input_ids.shape[1] :], axis=-1) + position_ids = position_ids[:, -input_ids.shape[1] :] inputs["position_ids"] = position_ids diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 65094ae221..f54305113f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -632,6 +632,11 @@ def test_multiple_inputs(self, model_arch): outputs = model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) self.assertEqual(outputs.shape[0], 3) + # test that generation result is reproducible + outputs2 = model.generate(**tokens, generation_config=generation_config) + 
self.assertIsInstance(outputs2, torch.Tensor) + self.assertEqual(outputs2.shape[0], 3) + self.assertTrue(torch.allclose(outputs2, outputs)) del model gc.collect()
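Reviewer note (illustrative, not part of the patches): the core behavioral change of this series is that prepare_inputs_for_generation no longer hard-codes "keep only the last token" and instead trims input_ids against the tracked past length. Below is a minimal standalone sketch of that trimming rule and of the _get_past_length dispatch added in patch 3, written against plain numpy arrays. The DummyDecoder class, the contents of MULTI_QUERY_ATTN_MODELS and the example shapes are assumptions made for the sketch; the real method lives on OVModelForCausalLM and reads self.config.model_type, self.stateful and self._past_length.

    import numpy as np

    MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"}  # assumed contents for this sketch


    class DummyDecoder:
        """Stand-in holding only the attributes the real method reads."""

        def __init__(self, model_type, stateful=False):
            self.model_type = model_type
            self.stateful = stateful
            self._past_length = 0  # incremented by forward() in the stateful case

        def _get_past_length(self, past_key_values=None):
            # Mirrors the helper added in patch 3: pick the sequence-length axis per model family.
            if past_key_values is None:
                return 0
            if self.stateful:
                return self._past_length
            if self.model_type in MULTI_QUERY_ATTN_MODELS:
                return past_key_values[0].shape[-2]
            seq_length_dim = -2
            if self.model_type == "chatglm":
                seq_length_dim = 0
            elif self.model_type == "qwen":
                seq_length_dim = 1
            if isinstance(past_key_values[0], (tuple, list)):  # tuple of (key, value) pairs
                return past_key_values[0][1].shape[seq_length_dim]
            return past_key_values[1].shape[seq_length_dim]  # flattened key/value layout

        def trim_input_ids(self, input_ids, past_key_values, attention_mask):
            # Mirrors the "keep only the unprocessed tokens" rules from prepare_inputs_for_generation.
            if past_key_values is None:
                return input_ids
            past_len = self._get_past_length(past_key_values)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                return input_ids[:, -(attention_mask.shape[1] - past_len):]
            if past_len < input_ids.shape[1]:
                return input_ids[:, past_len:]
            return input_ids  # past covers everything: assume only unprocessed tokens were passed


    # llama-style cache: (key, value) pairs shaped (batch, heads, past_seq, head_dim)
    decoder = DummyDecoder("llama")
    pkv = ((np.zeros((1, 8, 5, 64)), np.zeros((1, 8, 5, 64))),)
    input_ids = np.arange(7).reshape(1, 7)            # 5 cached tokens + 2 new ones
    attention_mask = np.ones((1, 7), dtype=np.int64)
    print(decoder.trim_input_ids(input_ids, pkv, attention_mask))  # -> [[5 6]]

Unlike the previous behavior, more than one new token can now be fed in a single step, which is what the relaxed position_ids slicing (position_ids[:, -input_ids.shape[1]:]) in patches 1 and 5 accounts for.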
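A second illustrative snippet, mirroring the reproducibility check added to tests/openvino/test_modeling.py in patch 5. The model id, padding setup, input texts and generation parameters are assumptions chosen for the sketch, not values taken from the test file; it only assumes the public OVModelForCausalLM API (from_pretrained with export=True, generate).

    import torch
    from transformers import AutoTokenizer, GenerationConfig
    from optimum.intel import OVModelForCausalLM

    model_id = "hf-internal-testing/tiny-random-gpt2"  # assumed tiny model; any exportable causal LM works
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # decoder-only models are padded on the left for batched generation

    # export=True converts the PyTorch checkpoint to OpenVINO IR on the fly
    model = OVModelForCausalLM.from_pretrained(model_id, export=True)

    texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"]
    tokens = tokenizer(texts, padding=True, return_tensors="pt")
    generation_config = GenerationConfig(max_new_tokens=20, num_beams=2, do_sample=False)

    outputs = model.generate(**tokens, generation_config=generation_config)
    outputs2 = model.generate(**tokens, generation_config=generation_config)
    assert outputs.shape[0] == 3
    assert torch.equal(outputs, outputs2)  # generation is expected to be reproducible after this series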