From 6dc45244c43e5b30102c5e714a1cb4643d47a652 Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 19 Feb 2024 18:40:35 +0400 Subject: [PATCH 1/5] [OV]: Fixed inference after 4 bit weight compression --- optimum/intel/openvino/modeling_decoder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 08165578f0..52175661d9 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -592,7 +592,9 @@ def _from_pretrained( else: init_cls = cls - causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) + causal_model = init_cls( + model=model, config=config, model_save_dir=model_cache_path.parent, compile=not load_in_4bit, **kwargs + ) if load_in_4bit: if not is_nncf_available(): From 7188696edab57759aafdf2c921cf9dcce85d0949 Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 19 Feb 2024 19:50:36 +0400 Subject: [PATCH 2/5] Fixed issue --- optimum/intel/openvino/modeling_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 52175661d9..5285ac24dd 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -611,6 +611,7 @@ def _from_pretrained( ) _weight_only_quantization(causal_model, quantization_config) + causal_model.request = None return causal_model From d06a344035f30cae959faba71cd1a1c715b7e994 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Tue, 20 Feb 2024 09:57:22 +0400 Subject: [PATCH 3/5] Update optimum/intel/openvino/modeling_decoder.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/openvino/modeling_decoder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py 
index 5285ac24dd..3fb79703ba 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -592,8 +592,9 @@ def _from_pretrained( else: init_cls = cls + enable_compilation = kwargs.pop("compile", True) and not load_in_4bit causal_model = init_cls( - model=model, config=config, model_save_dir=model_cache_path.parent, compile=not load_in_4bit, **kwargs + model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs ) if load_in_4bit: From c816e9672e130f70f2dbe3c2e1a46ec7a87bd787 Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 20 Feb 2024 10:00:33 +0400 Subject: [PATCH 4/5] Applied comments --- optimum/intel/openvino/modeling_decoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3fb79703ba..9abb5c420a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -612,7 +612,6 @@ def _from_pretrained( ) _weight_only_quantization(causal_model, quantization_config) - causal_model.request = None return causal_model From ede353ea40e7216c71529c5e19473b8aa9161e81 Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 20 Feb 2024 15:24:04 +0400 Subject: [PATCH 5/5] Fixed issue when request is None --- optimum/intel/openvino/modeling_decoder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9abb5c420a..8bcf877bff 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -419,7 +419,8 @@ def prepare_inputs( # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: # This is the first iteration in a sequence, reset all states - self.request.reset_state() + if self.request is not None: + self.request.reset_state() # Set initial 
value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int)