Skip to content

Commit 27b30ac

Browse files
committed
fix quantization
1 parent bb18f27 commit 27b30ac

File tree

2 files changed: +35 −27 lines changed

optimum/intel/openvino/modeling_seq2seq.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,7 @@ def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se
577577
is_legacy = any("past_key_values" in key.get_any_name() for key in self.model.outputs)
578578
self.use_past = len(self.key_value_input_names) > 0 or self.stateful
579579
self.next_beam_idx = None
580+
self._past_length = 0
580581

581582
if len(self.key_value_input_names) > 0 and not is_legacy:
582583
self.use_past = True
@@ -625,7 +626,7 @@ def forward(
625626

626627
if self.stateful and past_key_values is None:
627628
self.request.reset_state()
628-
self._past_len = 0
629+
self._past_length = 0
629630

630631
if past_key_values is not None and not self.stateful:
631632
# Flatten the past_key_values
@@ -664,7 +665,7 @@ def forward(
664665
self.request.start_async(inputs, share_inputs=True)
665666
self.request.wait()
666667
logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device)
667-
self._past_len += input_ids.shape[1]
668+
self._past_length += input_ids.shape[1]
668669

669670
out_past_key_values = ()
670671

@@ -689,6 +690,13 @@ def forward(
689690

690691
return Seq2SeqLMOutput(logits=logits, past_key_values=out_past_key_values)
691692

693+
def _get_past_length(self, past_key_values=None):
694+
if past_key_values is None:
695+
return 0
696+
if self.stateful:
697+
return self._past_length
698+
return past_key_values[0][0].shape[-2]
699+
692700
def __call__(self, *args, **kwargs):
693701
return self.forward(*args, **kwargs)
694702

@@ -1074,10 +1082,7 @@ def prepare_inputs_for_generation(
10741082

10751083
past_length = 0
10761084
if past_key_values is not None:
1077-
if self.decoder.stateful:
1078-
past_length = getattr(self.decoder, "_past_len", 0)
1079-
else:
1080-
past_length = past_key_values[0][0].shape[2]
1085+
self.decoder._get_past_length(past_key_values)
10811086

10821087
# Some generation methods already pass only the last input ID
10831088
if decoder_input_ids.shape[1] > past_length:

optimum/intel/openvino/quantization.py

+24-21
Original file line numberDiff line numberDiff line change
@@ -828,12 +828,14 @@ def _prepare_speech_to_text_calibration_data(self, config: OVQuantizationConfigB
828828
decoder_model.request, decoder_calibration_data, apply_caching=True
829829
)
830830

831-
decoder_w_p_calibration_data = []
832-
decoder_w_p_model = self.model.decoder_with_past
833-
decoder_w_p_model._compile()
834-
decoder_w_p_model.request = InferRequestWrapper(
835-
decoder_w_p_model.request, decoder_w_p_calibration_data, apply_caching=True
836-
)
831+
decoder_w_p_model = None
832+
if self.model.decoder_with_past_model is not None:
833+
decoder_w_p_calibration_data = []
834+
decoder_w_p_model = self.model.decoder_with_past
835+
decoder_w_p_model._compile()
836+
decoder_w_p_model.request = InferRequestWrapper(
837+
decoder_w_p_model.request, decoder_w_p_calibration_data, apply_caching=True
838+
)
837839

838840
dataset_metadata = PREDEFINED_SPEECH_TO_TEXT_DATASETS[config.dataset]
839841

@@ -867,13 +869,13 @@ def _prepare_speech_to_text_calibration_data(self, config: OVQuantizationConfigB
867869
finally:
868870
encoder_model.request = encoder_model.request.request
869871
decoder_model.request = decoder_model.request.request
870-
decoder_w_p_model.request = decoder_w_p_model.request.request
872+
if decoder_w_p_model is not None:
873+
decoder_w_p_model.request = decoder_w_p_model.request.request
871874

872-
return (
873-
nncf.Dataset(encoder_calibration_data),
874-
nncf.Dataset(decoder_calibration_data),
875-
nncf.Dataset(decoder_w_p_calibration_data),
876-
)
875+
datasets = [nncf.Dataset(encoder_calibration_data), nncf.Dataset(decoder_calibration_data),]
876+
if decoder_w_p_model is not None:
877+
datasets.append(nncf.Dataset(decoder_w_p_calibration_data))
878+
return datasets
877879

878880
def _prepare_text_generation_calibration_data(
879881
self, quantization_config: OVQuantizationConfigBase, calibration_dataloader: OVDataLoader
@@ -986,15 +988,16 @@ def _quantize_whisper_model(self, quantization_config, calibration_dataset, **kw
986988
self.model.decoder.model = quantized_decoder_model
987989
self.model.decoder.request = None
988990

989-
# Quantize decoder with past model
990-
config = copy.deepcopy(quantization_config)
991-
config.num_samples = calibration_dataset[2].get_length()
992-
quantized_decoder_w_p_model = _full_quantization(
993-
self.model.decoder_with_past_model, config, calibration_dataset[2], **kwargs
994-
)
995-
self.model.decoder_with_past_model = quantized_decoder_w_p_model
996-
self.model.decoder_with_past.model = quantized_decoder_w_p_model
997-
self.model.decoder_with_past.request = None
991+
if self.model.decoder_with_past_model is not None:
992+
# Quantize decoder with past model
993+
config = copy.deepcopy(quantization_config)
994+
config.num_samples = calibration_dataset[2].get_length()
995+
quantized_decoder_w_p_model = _full_quantization(
996+
self.model.decoder_with_past_model, config, calibration_dataset[2], **kwargs
997+
)
998+
self.model.decoder_with_past_model = quantized_decoder_w_p_model
999+
self.model.decoder_with_past.model = quantized_decoder_w_p_model
1000+
self.model.decoder_with_past.request = None
9981001

9991002

10001003
def _weight_only_quantization(

0 commit comments

Comments (0)