From d78950f500cc49294e7a6ffe0422d48f0ef94ad4 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Mon, 22 Apr 2024 19:13:19 +0200
Subject: [PATCH 1/6] Move calibration dataset construction to weight compression function
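
Dataset construction from a string identifier used to happen at the
`_from_pretrained` call site in modeling_decoder.py. It now lives inside
`_weight_only_quantization`, which accepts an optional `tokenizer` and a
`transform_fn` used to wrap the prepared samples into an `nncf.Dataset`.

The user-facing entry point is unchanged. A minimal sketch, assuming
`model_id` names a causal LM checkpoint (the `bits=4` and
`dataset="wikitext2"` values are illustrative, not part of this patch):

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    model = OVModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=OVWeightQuantizationConfig(bits=4, dataset="wikitext2"),
    )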

---
 optimum/intel/openvino/modeling_decoder.py | 22 +++++++---------------
 optimum/intel/openvino/quantization.py     |  7 +++++--
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 39a7bee9a2..ca7352076c 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -624,29 +624,21 @@ def _from_pretrained(
                 raise ImportError(
                     "Quantization of the weights requires nncf, please install it with `pip install nncf`"
                 )
-            import nncf
 
-            from .quantization import _weight_only_quantization
+            from optimum.intel.openvino.quantization import _weight_only_quantization
 
             default_config = _check_default_4bit_configs(config)
-
             if default_config:
                 logger.info(
                     f"For the given model, we recommend the following `quantization_config` : {default_config}"
                 )
 
-            calibration_dataset = None
-            if isinstance(quantization_config.dataset, str):
-                tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)
-
-                from optimum.gptq.data import get_dataset, prepare_dataset
-
-                nsamples = quantization_config.num_samples or 128
-                dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
-                dataset = prepare_dataset(dataset)
-                calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
-
-            _weight_only_quantization(model, quantization_config, calibration_dataset)
+            _weight_only_quantization(
+                model,
+                quantization_config,
+                tokenizer=AutoTokenizer.from_pretrained(quantization_config.tokenizer or model_id),
+                transform_fn=lambda x: causal_model.prepare_inputs(**x),
+            )
 
         return causal_model
 
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 217e5e4056..eb55909a8a 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -33,7 +33,7 @@
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader, RandomSampler
-from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
+from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator, PreTrainedTokenizer
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
@@ -622,6 +622,8 @@ def _weight_only_quantization(
     model: openvino.runtime.Model,
     quantization_config: Union[OVWeightQuantizationConfig, Dict],
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
+    tokenizer: Optional[PreTrainedTokenizer] = None,
+    transform_fn: Optional[Callable] = None,
 ) -> openvino.runtime.Model:
     config = quantization_config
     if isinstance(config, dict):
@@ -645,13 +647,14 @@ def _weight_only_quantization(
         else:
             dataset = nncf.Dataset(calibration_dataset)
     elif config.dataset is not None and isinstance(config.dataset, str):
-        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer)
+        tokenizer = tokenizer or AutoTokenizer.from_pretrained(config.tokenizer)
 
         from optimum.gptq.data import get_dataset, prepare_dataset
 
         nsamples = config.num_samples if config.num_samples else 128
         dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
         dataset = prepare_dataset(dataset)
+        dataset = nncf.Dataset(dataset, transform_fn)
 
     sensitivity_metric = None
     if isinstance(config.sensitivity_metric, str):

From 99471b2f8dec0f93568064e1e321dd1c61006e63 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:34:20 +0200
Subject: [PATCH 2/6] Pass tokenizer through the quantization config
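
Rather than instantiating an `AutoTokenizer` at the call site, carry the
tokenizer id on a deep copy of the quantization config and let
`_weight_only_quantization` load it as before. The copy avoids mutating
the user-provided config:

    quantization_config_copy = copy.deepcopy(quantization_config)
    quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id

This also removes the `tokenizer` parameter added in the previous patch.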

---
 optimum/intel/openvino/modeling_decoder.py | 7 ++++---
 optimum/intel/openvino/quantization.py     | 3 +--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index ca7352076c..18ee086fbd 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -11,7 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-
+import copy
 import logging
 import os
 from pathlib import Path
@@ -633,10 +633,11 @@ def _from_pretrained(
                     f"For the given model, we recommend the following `quantization_config` : {default_config}"
                 )
 
+            quantization_config_copy = copy.deepcopy(quantization_config)
+            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
             _weight_only_quantization(
                 model,
-                quantization_config,
-                tokenizer=AutoTokenizer.from_pretrained(quantization_config.tokenizer or model_id),
+                quantization_config_copy,
                 transform_fn=lambda x: causal_model.prepare_inputs(**x),
             )
 
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index eb55909a8a..c31cc5bacc 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -622,7 +622,6 @@ def _weight_only_quantization(
     model: openvino.runtime.Model,
     quantization_config: Union[OVWeightQuantizationConfig, Dict],
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
-    tokenizer: Optional[PreTrainedTokenizer] = None,
     transform_fn: Optional[Callable] = None,
 ) -> openvino.runtime.Model:
     config = quantization_config
@@ -647,7 +646,7 @@ def _weight_only_quantization(
         else:
             dataset = nncf.Dataset(calibration_dataset)
     elif config.dataset is not None and isinstance(config.dataset, str):
-        tokenizer = tokenizer or AutoTokenizer.from_pretrained(config.tokenizer)
+        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer)
 
         from optimum.gptq.data import get_dataset, prepare_dataset
 

From b986830826d832c4f319eb6c9849ab623e0f6085 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:36:48 +0200
Subject: [PATCH 3/6] Remove unused import

---
 optimum/intel/openvino/quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index c31cc5bacc..39cd7ead94 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -33,7 +33,7 @@
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader, RandomSampler
-from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator, PreTrainedTokenizer
+from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 

From cdbedb42aad95116743241fd7142bdc1a6101566 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:52:27 +0200
Subject: [PATCH 4/6] Apply ruff import fixes

---
 optimum/intel/openvino/modeling_decoder.py |  2 +-
 optimum/intel/openvino/quantization.py     | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 18ee086fbd..2bf33eb38d 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -23,7 +23,7 @@
 import torch
 from openvino.preprocess import PrePostProcessor
 from openvino.runtime import Core, Tensor, Type
-from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
+from transformers import AutoModelForCausalLM, PretrainedConfig
 from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithPast
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 39cd7ead94..66c7dab498 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -21,14 +21,9 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import datasets
-import nncf
 import openvino
 import torch
 import transformers
-from nncf import CompressWeightsMode, SensitivityMetric
-from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
-from nncf.torch import register_module
-from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
@@ -37,6 +32,11 @@
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
+import nncf
+from nncf import CompressWeightsMode, SensitivityMetric
+from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
+from nncf.torch import register_module
+from nncf.torch.initialization import PTInitializingDataLoader
 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
 from optimum.quantization_base import OptimumQuantizer

From fa4065f5f64953a339f94d3ae5770b7150e15823 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Tue, 23 Apr 2024 15:53:47 +0200
Subject: [PATCH 5/6] Apply ruff import fixes, second pass

---
 optimum/intel/openvino/quantization.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 66c7dab498..39cd7ead94 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -21,9 +21,14 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import datasets
+import nncf
 import openvino
 import torch
 import transformers
+from nncf import CompressWeightsMode, SensitivityMetric
+from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
+from nncf.torch import register_module
+from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
@@ -32,11 +37,6 @@
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
-import nncf
-from nncf import CompressWeightsMode, SensitivityMetric
-from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix
-from nncf.torch import register_module
-from nncf.torch.initialization import PTInitializingDataLoader
 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
 from optimum.quantization_base import OptimumQuantizer

From ea3f2113df52015e309aacd6a92ef85eba9580f5 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev <nikita.savelyev@intel.com>
Date: Thu, 25 Apr 2024 09:51:34 +0200
Subject: [PATCH 6/6] Refactor weight compression to go through OVQuantizer
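
Instead of invoking the private `_weight_only_quantization` helper directly,
`_from_pretrained` now routes data-aware weight compression through the
public `OVQuantizer.quantize()` API. To support this, `save_directory`
becomes optional for OpenVINO models: when omitted, the model is compressed
in place and nothing is written to disk. A minimal sketch of the new flow,
assuming `causal_model` is an already loaded `OVModelForCausalLM`:

    quantizer = OVQuantizer(causal_model)
    quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))

The calibration-dataset construction moved in patch 1 now lives in
`_quantize_ovbasemodel`, which can build a dataset from a string identifier
only for `OVModelForCausalLM` and raises a `ValueError` otherwise.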

---
 optimum/intel/openvino/modeling_decoder.py | 10 ++--
 optimum/intel/openvino/quantization.py     | 57 +++++++++++++---------
 2 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 2bf33eb38d..3acd18dab0 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -625,21 +625,19 @@ def _from_pretrained(
                     "Quantization of the weights requires nncf, please install it with `pip install nncf`"
                 )
 
-            from optimum.intel.openvino.quantization import _weight_only_quantization
+            from optimum.intel.openvino.quantization import OVQuantizer
 
             default_config = _check_default_4bit_configs(config)
+
             if default_config:
                 logger.info(
                     f"For the given model, we recommend the following `quantization_config` : {default_config}"
                 )
 
+            quantizer = OVQuantizer(causal_model)
             quantization_config_copy = copy.deepcopy(quantization_config)
             quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
-            _weight_only_quantization(
-                model,
-                quantization_config_copy,
-                transform_fn=lambda x: causal_model.prepare_inputs(**x),
-            )
+            quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
 
         return causal_model
 
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 39cd7ead94..f3b09f5aa7 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -198,7 +198,7 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs):
     def quantize(
         self,
         calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None,
-        save_directory: Union[str, Path] = None,
+        save_directory: Optional[Union[str, Path]] = None,
         ov_config: OVConfig = None,
         file_name: Optional[str] = None,
         batch_size: int = 1,
@@ -214,7 +214,7 @@ def quantize(
             calibration_dataset (`datasets.Dataset` or `nncf.Dataset` or `Iterable`, *optional*):
                 A collection of data samples to use for quantization calibration. Is optional for weight-only
                 quantization and is required for full quantization.
-            save_directory (`Union[str, Path]`):
+            save_directory (`Union[str, Path]`, *optional*):
                 The directory where the quantized model should be saved.
             ov_config (`OVConfig`, *optional*):
                 The configuration containing the parameters related to quantization. If not provided, 8-bit symmetric
@@ -262,10 +262,6 @@ def quantize(
                 "as an instance of `OVWeightQuantizationConfig` for weight-only compression or as an instance of `OVQuantizationConfig` for full model quantization."
             )
 
-        if save_directory is None:
-            # TODO : can be set to self.model.config.name_or_path for OVModels when not provided
-            raise ValueError("`save_directory` needs to be specified")
-
         if ov_config is None:
             ov_config = OVConfig()
         if not isinstance(ov_config, OVConfig):
@@ -318,21 +314,41 @@ def quantize(
     def _quantize_ovbasemodel(
         self,
         ov_config: OVConfig,
-        save_directory: Union[str, Path],
+        save_directory: Optional[Union[str, Path]] = None,
         calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None,
         batch_size: int = 1,
         data_collator: Optional[DataCollator] = None,
         remove_unused_columns: bool = True,
         **kwargs,
     ):
-        save_directory = Path(save_directory)
-        save_directory.mkdir(parents=True, exist_ok=True)
+        if save_directory is not None:
+            save_directory = Path(save_directory)
+            save_directory.mkdir(parents=True, exist_ok=True)
 
         quantization_config = ov_config.quantization_config
         if isinstance(quantization_config, OVWeightQuantizationConfig):
+            if calibration_dataset is None and isinstance(quantization_config.dataset, str):
+                from optimum.intel import OVModelForCausalLM
+
+                if isinstance(self.model, OVModelForCausalLM):
+                    from optimum.gptq.data import get_dataset, prepare_dataset
+
+                    tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer)
+                    nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
+                    calibration_dataset = get_dataset(
+                        quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples
+                    )
+                    calibration_dataset = prepare_dataset(calibration_dataset)
+                    calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x))
+                else:
+                    raise ValueError(
+                        f"Can't create weight compression calibration dataset from string for {type(self.model)}"
+                    )
+
             _weight_only_quantization(self.model.model, quantization_config, calibration_dataset)
-            self.model.save_pretrained(save_directory)
-            ov_config.save_pretrained(save_directory)
+            if save_directory is not None:
+                self.model.save_pretrained(save_directory)
+                ov_config.save_pretrained(save_directory)
             return
         if not isinstance(quantization_config, OVQuantizationConfig):
             raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")
@@ -384,8 +400,9 @@ def _quantize_ovbasemodel(
             **kwargs,
         )
         self.model.model = quantized_model
-        self.model.save_pretrained(save_directory)
-        ov_config.save_pretrained(save_directory)
+        if save_directory is not None:
+            self.model.save_pretrained(save_directory)
+            ov_config.save_pretrained(save_directory)
 
     def _quantize_torchmodel(
         self,
@@ -398,6 +415,10 @@ def _quantize_torchmodel(
         remove_unused_columns: bool = True,
         **kwargs,
     ):
+        if save_directory is None:
+            # TODO : can be set to self.model.config.name_or_path for OVModels when not provided
+            raise ValueError("`save_directory` needs to be specified")
+
         self._set_task()
         save_directory = Path(save_directory)
         save_directory.mkdir(parents=True, exist_ok=True)
@@ -622,7 +643,6 @@ def _weight_only_quantization(
     model: openvino.runtime.Model,
     quantization_config: Union[OVWeightQuantizationConfig, Dict],
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
-    transform_fn: Optional[Callable] = None,
 ) -> openvino.runtime.Model:
     config = quantization_config
     if isinstance(config, dict):
@@ -645,15 +665,6 @@ def _weight_only_quantization(
             dataset = calibration_dataset
         else:
             dataset = nncf.Dataset(calibration_dataset)
-    elif config.dataset is not None and isinstance(config.dataset, str):
-        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer)
-
-        from optimum.gptq.data import get_dataset, prepare_dataset
-
-        nsamples = config.num_samples if config.num_samples else 128
-        dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
-        dataset = prepare_dataset(dataset)
-        dataset = nncf.Dataset(dataset, transform_fn)
 
     sensitivity_metric = None
     if isinstance(config.sensitivity_metric, str):