Skip to content

Commit 18ba0bd

Browse files
[OV]: Fixed inference after 4 bit weight compression (#569)
* [OV]: Fixed inference after 4 bit weight compression * Fixed issue * Update optimum/intel/openvino/modeling_decoder.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Applied comments * Fixed issue when request is None --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
1 parent 8f7d016 commit 18ba0bd

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

optimum/intel/openvino/modeling_decoder.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,8 @@ def prepare_inputs(
419419
# past_key_values are not used explicitly, instead they are handled inside the model
420420
if past_key_values is None:
421421
# This is the first iteration in a sequence, reset all states
422-
self.request.reset_state()
422+
if self.request is not None:
423+
self.request.reset_state()
423424
# Set initial value for the next beam_idx input that will be used at the current iteration
424425
# and will be optionally updated by _reorder_cache at the next iterations if beam_search is used
425426
self.next_beam_idx = np.arange(batch_size, dtype=int)
@@ -592,7 +593,10 @@ def _from_pretrained(
592593
else:
593594
init_cls = cls
594595

595-
causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)
596+
enable_compilation = kwargs.pop("compile", True) and not load_in_4bit
597+
causal_model = init_cls(
598+
model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs
599+
)
596600

597601
if load_in_4bit:
598602
if not is_nncf_available():

0 commit comments

Comments (0)