From 56878bb0e108338f11cd6a4a37d0d81b0bb060cb Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 14:53:16 +0200 Subject: [PATCH 01/28] Add quantization with dataset after model export for text-generation models --- optimum/commands/export/openvino.py | 63 +++++++++++++++++++++++++- optimum/exporters/openvino/__main__.py | 30 ++++++------ tests/openvino/test_exporters_cli.py | 12 ++++- 3 files changed, 89 insertions(+), 16 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index ffd084d4e6..310ec30c97 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -15,6 +15,7 @@ import logging import sys +import tempfile from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -128,6 +129,29 @@ def parse_args_openvino(parser: "ArgumentParser"): "compression is applied, they are compressed to INT8." ), ) + optional_group.add_argument( + "--quant-method", + type=str, + default=None, + choices=["default", "awq", "hybrid"], + help=("The quantization method to apply. Can be one of the following: ['default', 'awq', 'hybrid']."), + ) + optional_group.add_argument( + "--sensitivity-metric", + type=str, + default=None, + help=( + "The sensitivity metric for assigning quantization precision to layers. Can be one of the following: " + "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', " + "'max_activation_variance', 'mean_activation_magnitude']." + ), + ) + optional_group.add_argument( + "--num-samples", + type=int, + default=None, + help=("The maximum number of samples composing the calibration dataset for quantization."), + ) optional_group.add_argument( "--disable-stateful", action="store_true", @@ -180,7 +204,7 @@ def parse_args(parser: "ArgumentParser"): return parse_args_openvino(parser) def run(self): - from ...exporters.openvino.__main__ import main_export + from ...exporters.openvino.__main__ import main_export, infer_task from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig if self.args.fp16: @@ -208,6 +232,9 @@ def run(self): and self.args.group_size is None and self.args.sym is None and self.args.all_layers is None + and self.args.dataset is None + and self.args.quant_method is None + and self.args.sensitivity_metric is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model] @@ -218,6 +245,10 @@ def run(self): "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, "all_layers": None if is_int8 else self.args.all_layers, + "dataset": self.args.dataset, + "num_samples": self.args.num_samples, + "quant_method": self.args.quant_method, + "sensitivity_metric": self.args.sensitivity_metric, } if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: @@ -226,7 +257,6 @@ def run(self): ) quantization_config["sym"] = "asym" not in self.args.weight_format quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64 - quantization_config["dataset"] = self.args.dataset ov_config = OVConfig(quantization_config=quantization_config) library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library) @@ -290,6 +320,19 @@ def run(self): if tokenizer_2 is not None: export_tokenizer(tokenizer_2, output / "tokenizer_2") else: + task = infer_task(self.args.task, self.args.model) + quantization_config = ov_config.quantization_config + quantize_after_export = 
( + task.startswith("text-generation") + and quantization_config is not None + and hasattr(quantization_config, "dataset") + and quantization_config.dataset is not None + ) + if quantize_after_export: + # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is + # required. That's why the quantization is skipped during export and applied explicitly after export. + ov_config.quantization_config = None + # TODO : add input shapes main_export( model_name_or_path=self.args.model, @@ -305,3 +348,19 @@ def run(self): library_name=library_name, # **input_shapes, ) + + if quantize_after_export: + from optimum.intel import OVModelForCausalLM, OVQuantizer + + model = OVModelForCausalLM.from_pretrained(self.args.output) + quantizer = OVQuantizer(model) + quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) + # TODO: set save_directory=self.args.output once OV is updated to 2024.3 + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + with tempfile.TemporaryDirectory() as temp_dir: + import shutil + + model.save_pretrained(temp_dir) + ov_config.save_pretrained(self.args.output) + shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") + shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 9db6719069..1204c8d4cf 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -44,6 +44,22 @@ logger = logging.getLogger(__name__) +def infer_task(task, model_name_or_path): + task = TasksManager.map_from_synonym(task) + if task == "auto": + try: + task = TasksManager.infer_task_from_model(model_name_or_path) + except KeyError as e: + raise KeyError( + f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + except RequestsConnectionError as e: + raise RequestsConnectionError( + f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + return task + + def main_export( model_name_or_path: str, output: Union[str, Path], @@ -174,7 +190,7 @@ def main_export( ov_config = OVConfig(quantization_config=q_config) original_task = task - task = TasksManager.map_from_synonym(task) + task = infer_task(task, model_name_or_path) framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) library_name_is_not_provided = library_name is None library_name = TasksManager.infer_library_from_model( @@ -188,18 +204,6 @@ def main_export( ) library_name = "transformers" - if task == "auto": - try: - task = TasksManager.infer_task_from_model(model_name_or_path) - except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) - except RequestsConnectionError as e: - raise RequestsConnectionError( - f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" - ) - do_gptq_patching = False custom_architecture = False loading_kwargs = {} diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index cce25bbae1..21ced61edb 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -89,6 +89,14 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86), ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86), ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32), + ( + "text-generation-with-past", + "llama_awq", + "int4 --ratio 1.0 --sym --group-size 16 --quant-method awq --dataset wikitext2 --num-samples 100 " + "--sensitivity-metric max_activation_variance", + 4, + 28, + ), ] def _openvino_export( @@ -197,10 +205,11 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in @parameterized.expand(TEST_4BIT_CONFIGURATONS) def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int): with TemporaryDirectory() as tmpdir: - subprocess.run( + result = subprocess.run( f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", shell=True, check=True, + capture_output=True, ) model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {} model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs) @@ -208,6 +217,7 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec _, num_int8, num_int4 = get_num_quantized_nodes(model) self.assertEqual(expected_int8, num_int8) self.assertEqual(expected_int4, num_int4) + self.assertTrue("--quant-method awq" not in option or b"Applying AWQ" in result.stdout) def test_exporters_cli_help(self): subprocess.run( From 013a0f656fc015db97bbaabb4cc79634e2cd98db Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 15:22:46 +0200 Subject: [PATCH 02/28] Tweak AWQ CLI interface --- optimum/commands/export/openvino.py | 12 +++++------- tests/openvino/test_exporters_cli.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 310ec30c97..d6d0114d93 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -130,11 +130,10 @@ def parse_args_openvino(parser: "ArgumentParser"): ), ) optional_group.add_argument( - "--quant-method", - type=str, + "--awq", + action="store_true", default=None, - choices=["default", "awq", "hybrid"], - help=("The quantization method to apply. Can be one of the following: ['default', 'awq', 'hybrid']."), + help="Whether to apply AWQ algorithm. 
To run AWQ, please also provide a dataset.", ) optional_group.add_argument( "--sensitivity-metric", @@ -150,7 +149,7 @@ def parse_args_openvino(parser: "ArgumentParser"): "--num-samples", type=int, default=None, - help=("The maximum number of samples composing the calibration dataset for quantization."), + help="The maximum number of samples to take from the dataset for quantization.", ) optional_group.add_argument( "--disable-stateful", @@ -233,7 +232,6 @@ def run(self): and self.args.sym is None and self.args.all_layers is None and self.args.dataset is None - and self.args.quant_method is None and self.args.sensitivity_metric is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): @@ -247,7 +245,7 @@ def run(self): "all_layers": None if is_int8 else self.args.all_layers, "dataset": self.args.dataset, "num_samples": self.args.num_samples, - "quant_method": self.args.quant_method, + "quant_method": "awq" if self.args.awq else None, "sensitivity_metric": self.args.sensitivity_metric, } diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 21ced61edb..267aa88d62 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -92,7 +92,7 @@ class OVCLIExportTestCase(unittest.TestCase): ( "text-generation-with-past", "llama_awq", - "int4 --ratio 1.0 --sym --group-size 16 --quant-method awq --dataset wikitext2 --num-samples 100 " + "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 " "--sensitivity-metric max_activation_variance", 4, 28, From c566ccc094d4999dba3888a55ea43520e08850f5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 15:24:45 +0200 Subject: [PATCH 03/28] Additional checks --- optimum/commands/export/openvino.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index d6d0114d93..4594cb387e 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -232,6 +232,8 @@ def run(self): and self.args.sym is None and self.args.all_layers is None and self.args.dataset is None + and self.args.num_samples is None + and self.args.awq is None and self.args.sensitivity_metric is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): From 0a8fba022c92058c923f5af46370c53419a28df5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 16:26:04 +0200 Subject: [PATCH 04/28] Fix --- optimum/commands/export/openvino.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 4594cb387e..42f09c1576 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -203,7 +203,7 @@ def parse_args(parser: "ArgumentParser"): return parse_args_openvino(parser) def run(self): - from ...exporters.openvino.__main__ import main_export, infer_task + from ...exporters.openvino.__main__ import infer_task, main_export from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig if self.args.fp16: @@ -321,10 +321,10 @@ def run(self): export_tokenizer(tokenizer_2, output / "tokenizer_2") else: task = infer_task(self.args.task, self.args.model) - quantization_config = ov_config.quantization_config + quantization_config = ov_config.quantization_config if ov_config else None quantize_after_export = ( task.startswith("text-generation") - and quantization_config is not None + and quantization_config and hasattr(quantization_config, "dataset") 
and quantization_config.dataset is not None ) From 6dbb4fe8c1bca94f3a240ec9e0e312a58b854d18 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 19:00:15 +0200 Subject: [PATCH 05/28] Trigger Build From 3722624bf011fe651d478fe0b4c21c5b7e729486 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 15:43:54 +0200 Subject: [PATCH 06/28] Add AWQ description --- optimum/commands/export/openvino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 42f09c1576..758b6ae65d 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -133,7 +133,10 @@ def parse_args_openvino(parser: "ArgumentParser"): "--awq", action="store_true", default=None, - help="Whether to apply AWQ algorithm. To run AWQ, please also provide a dataset.", + help=( + "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires " + "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset." + ), ) optional_group.add_argument( "--sensitivity-metric", From dee582d2abb0087564bce0ff7e62511fa8328048 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 16:37:26 +0200 Subject: [PATCH 07/28] Add trust remote code argument --- optimum/commands/export/openvino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 758b6ae65d..549b52c750 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -355,7 +355,10 @@ def run(self): if quantize_after_export: from optimum.intel import OVModelForCausalLM, OVQuantizer - model = OVModelForCausalLM.from_pretrained(self.args.output) + model = OVModelForCausalLM.from_pretrained( + self.args.output, + trust_remote_code=self.args.trust_remote_code + ) quantizer = OVQuantizer(model) quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) # TODO: set save_directory=self.args.output once OV is updated to 2024.3 From a44c0960ec00966d4b36ecf7a49ab9bfd24b5e61 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 16:47:14 +0200 Subject: [PATCH 08/28] Black --- optimum/commands/export/openvino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 549b52c750..e72b876212 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -356,8 +356,7 @@ def run(self): from optimum.intel import OVModelForCausalLM, OVQuantizer model = OVModelForCausalLM.from_pretrained( - self.args.output, - trust_remote_code=self.args.trust_remote_code + self.args.output, trust_remote_code=self.args.trust_remote_code ) quantizer = OVQuantizer(model) quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) From 12dc6720ef097e29fac44612a936ce2fc9478a88 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 17:03:32 +0200 Subject: [PATCH 09/28] Add note about possibility of skipping AWQ --- optimum/commands/export/openvino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index e72b876212..5a7e466ebc 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -135,7 +135,9 @@ def 
parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires "
-            "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset."
+            "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset "
+            "argument. Note: it's possible that there will be no matching patterns in the model to apply AWQ, in which "
+            "case it will be skipped."
         ),
     )
     optional_group.add_argument(

From bcc46652a8c47f09dd92d6a0ec4185c8da20ddb1 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 23 May 2024 13:28:17 +0200
Subject: [PATCH 10/28] Removed saving to temporary directory; added core property handling for OVModelForCausalLM

---
 optimum/commands/export/openvino.py        | 16 +++++-----------
 optimum/intel/openvino/modeling_base.py    |  5 +++++
 optimum/intel/openvino/modeling_decoder.py |  7 +++++++
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 5a7e466ebc..1276340871 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -15,7 +15,6 @@
 
 import logging
 import sys
-import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -357,17 +356,12 @@ def run(self):
         if quantize_after_export:
             from optimum.intel import OVModelForCausalLM, OVQuantizer
 
+            # TODO: remove disabling mmap once OV is updated to 2024.3
             model = OVModelForCausalLM.from_pretrained(
-                self.args.output, trust_remote_code=self.args.trust_remote_code
+                self.args.output, trust_remote_code=self.args.trust_remote_code, ov_config={"ENABLE_MMAP": "NO"}
             )
             quantizer = OVQuantizer(model)
             quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output)
-            # TODO: set save_directory=self.args.output once OV is updated to 2024.3
-            quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
-            with tempfile.TemporaryDirectory() as temp_dir:
-                import shutil
-
-                model.save_pretrained(temp_dir)
-                ov_config.save_pretrained(self.args.output)
-                shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml")
-                shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin")
+            quantizer.quantize(
+                ov_config=OVConfig(quantization_config=quantization_config), save_directory=self.args.output
+            )
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 7937deea52..e61a1c7eca 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -103,6 +103,7 @@ def __init__(
     def load_model(
         file_name: Union[str, Path],
         quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        ov_core_properties: Optional[Dict] = None,
     ):
         """
         Loads the model.
@@ -112,6 +113,8 @@ def load_model(
                 The path of the model ONNX or XML file.
             quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
                 Quantization config to apply after model is loaded.
+            ov_core_properties (`Dict`, *optional*):
+                OpenVINO core properties to set before model loading.
""" def fix_op_names_duplicates(model: openvino.runtime.Model): @@ -128,6 +131,8 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if isinstance(file_name, str): file_name = Path(file_name) + if ov_core_properties: + core.set_property(ov_core_properties) model = core.read_model(file_name) if not file_name.suffix == ".onnx" else convert_model(file_name) if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2ad04ab14a..cf913c15b1 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -748,9 +748,16 @@ def _from_pretrained( load_in_4bit = quantization_config.bits == 4 if quantization_config else False + ov_config = kwargs.get("ov_config", None) + ov_core_properties = {} + if ov_config and "ENABLE_MMAP" in ov_config: + ov_core_properties["ENABLE_MMAP"] = ov_config["ENABLE_MMAP"] + del ov_config["ENABLE_MMAP"] + model = cls.load_model( model_cache_path, quantization_config=None if load_in_4bit else quantization_config, + ov_core_properties=ov_core_properties, ) model_type = config.model_type.replace("_", "-") From 40058dad55b7fe010cf03c62ae7b5ac8361d1847 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 23 May 2024 13:48:02 +0200 Subject: [PATCH 11/28] Revert "Removed saving to temporary directory; added core property handling for OVModelForCausalLM" This reverts commit bcc46652a8c47f09dd92d6a0ec4185c8da20ddb1. --- optimum/commands/export/openvino.py | 16 +++++++++++----- optimum/intel/openvino/modeling_base.py | 5 ----- optimum/intel/openvino/modeling_decoder.py | 7 ------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 1276340871..5a7e466ebc 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -15,6 +15,7 @@ import logging import sys +import tempfile from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -356,12 +357,17 @@ def run(self): if quantize_after_export: from optimum.intel import OVModelForCausalLM, OVQuantizer - # TODO: remove disabling mmap once OV is updated to 2024.3 model = OVModelForCausalLM.from_pretrained( - self.args.output, trust_remote_code=self.args.trust_remote_code, ov_config={"ENABLE_MMAP": "NO"} + self.args.output, trust_remote_code=self.args.trust_remote_code ) quantizer = OVQuantizer(model) quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) - quantizer.quantize( - ov_config=OVConfig(quantization_config=quantization_config), save_directory=self.args.output - ) + # TODO: set save_directory=self.args.output once OV is updated to 2024.3 + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + with tempfile.TemporaryDirectory() as temp_dir: + import shutil + + model.save_pretrained(temp_dir) + ov_config.save_pretrained(self.args.output) + shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") + shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index e61a1c7eca..7937deea52 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -103,7 +103,6 @@ def __init__( def load_model( file_name: Union[str, Path], 
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, - ov_core_properties: Optional[Dict] = None, ): """ Loads the model. @@ -113,8 +112,6 @@ def load_model( The path of the model ONNX or XML file. quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*): Quantization config to apply after model is loaded. - ov_core_properties (`Dict`, *optional*): - OpenVINO core properties to set before model loading. """ def fix_op_names_duplicates(model: openvino.runtime.Model): @@ -131,8 +128,6 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if isinstance(file_name, str): file_name = Path(file_name) - if ov_core_properties: - core.set_property(ov_core_properties) model = core.read_model(file_name) if not file_name.suffix == ".onnx" else convert_model(file_name) if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index cf913c15b1..2ad04ab14a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -748,16 +748,9 @@ def _from_pretrained( load_in_4bit = quantization_config.bits == 4 if quantization_config else False - ov_config = kwargs.get("ov_config", None) - ov_core_properties = {} - if ov_config and "ENABLE_MMAP" in ov_config: - ov_core_properties["ENABLE_MMAP"] = ov_config["ENABLE_MMAP"] - del ov_config["ENABLE_MMAP"] - model = cls.load_model( model_cache_path, quantization_config=None if load_in_4bit else quantization_config, - ov_core_properties=ov_core_properties, ) model_type = config.model_type.replace("_", "-") From 0886f7e29cae7bca8b118958c06d282f900b6110 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 23 May 2024 14:15:50 +0200 Subject: [PATCH 12/28] Add saving intermediate weights in fp16; add removal of intermediate model if compression fails --- optimum/commands/export/openvino.py | 40 +++++++++++++++++------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 5a7e466ebc..d3d49c18bf 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -14,6 +14,7 @@ """Defines the command line for the export with OpenVINO.""" import logging +import shutil import sys import tempfile from pathlib import Path @@ -337,6 +338,9 @@ def run(self): # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is # required. That's why the quantization is skipped during export and applied explicitly after export. 
ov_config.quantization_config = None + # Export intermediate model with f16 weights to save up disk space + original_dtype_value = ov_config.dtype + ov_config.dtype = "fp16" # TODO : add input shapes main_export( @@ -355,19 +359,23 @@ def run(self): ) if quantize_after_export: - from optimum.intel import OVModelForCausalLM, OVQuantizer - - model = OVModelForCausalLM.from_pretrained( - self.args.output, trust_remote_code=self.args.trust_remote_code - ) - quantizer = OVQuantizer(model) - quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) - # TODO: set save_directory=self.args.output once OV is updated to 2024.3 - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - with tempfile.TemporaryDirectory() as temp_dir: - import shutil - - model.save_pretrained(temp_dir) - ov_config.save_pretrained(self.args.output) - shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") - shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") + try: + from optimum.intel import OVModelForCausalLM, OVQuantizer + + ov_config.dtype = original_dtype_value + model = OVModelForCausalLM.from_pretrained( + self.args.output, trust_remote_code=self.args.trust_remote_code + ) + quantizer = OVQuantizer(model) + quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) + # TODO: set save_directory=self.args.output once OV is updated to 2024.3 + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + with tempfile.TemporaryDirectory() as temp_dir: + model.save_pretrained(temp_dir) + ov_config.save_pretrained(self.args.output) + shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") + shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") + except Exception as e: + # Delete non-compressed model if compression failed for some reason + shutil.rmtree(str(self.args.output)) + raise e From ee9b1b7ec05233459485b14cc72a5f028b749ac9 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 23 May 2024 16:16:05 +0200 Subject: [PATCH 13/28] Trigger checks From cb570682fdba239aa1e9a1813436a2ae329d81d5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 24 May 2024 10:49:12 +0200 Subject: [PATCH 14/28] Trigger checks From ee0b67fd096d8b76096b46078bd16eb4ae78f2d2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 28 May 2024 08:35:27 +0200 Subject: [PATCH 15/28] Trigger checks From cacbb36d20e63cde32aabfd01d2a01cecb0ea6ac Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 31 May 2024 12:00:28 +0200 Subject: [PATCH 16/28] Fix test --- tests/openvino/test_exporters_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 267aa88d62..c81761bc9f 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -217,7 +217,7 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec _, num_int8, num_int4 = get_num_quantized_nodes(model) self.assertEqual(expected_int8, num_int8) self.assertEqual(expected_int4, num_int4) - self.assertTrue("--quant-method awq" not in option or b"Applying AWQ" in result.stdout) + self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout) def test_exporters_cli_help(self): subprocess.run( From 814d96c0d4923465c7c724224654fa90237271fa Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: 
Fri, 31 May 2024 12:00:50 +0200 Subject: [PATCH 17/28] Refactor applying quantization with dataset --- optimum/commands/export/openvino.py | 84 ++++++---------------- optimum/intel/openvino/modeling_base.py | 1 + optimum/intel/openvino/modeling_decoder.py | 21 ++---- 3 files changed, 30 insertions(+), 76 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index d3d49c18bf..62575f5a1f 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -14,9 +14,7 @@ """Defines the command line for the export with OpenVINO.""" import logging -import shutil import sys -import tempfile from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -276,12 +274,12 @@ def run(self): if self.args.convert_tokenizer: logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") - if ( - library_name == "diffusers" - and ov_config - and ov_config.quantization_config - and ov_config.quantization_config.dataset is not None - ): + quantization_config = ov_config.quantization_config if ov_config else None + quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None + task = infer_task(self.args.task, self.args.model) + model = None + + if library_name == "diffusers" and quantize_with_dataset: if not is_diffusers_available(): raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models")) @@ -306,42 +304,17 @@ def run(self): else: raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") - model = model_cls.from_pretrained( - self.args.model, export=True, quantization_config=ov_config.quantization_config - ) + model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config) model.save_pretrained(self.args.output) + elif task.startswith("text-generation") and quantize_with_dataset: + from optimum.intel import OVModelForCausalLM - if self.args.disable_convert_tokenizer: - return - - # avoid import when using other exporters (IPEX, INC) - from ...exporters.openvino.convert import export_tokenizer - - output = Path(self.args.output) - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - export_tokenizer(tokenizer, output / "tokenizer") - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output / "tokenizer_2") - else: - task = infer_task(self.args.task, self.args.model) - quantization_config = ov_config.quantization_config if ov_config else None - quantize_after_export = ( - task.startswith("text-generation") - and quantization_config - and hasattr(quantization_config, "dataset") - and quantization_config.dataset is not None + # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required + model = OVModelForCausalLM.from_pretrained( + self.args.model, export=True, quantization_config=quantization_config ) - if quantize_after_export: - # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is - # required. That's why the quantization is skipped during export and applied explicitly after export. 
- ov_config.quantization_config = None - # Export intermediate model with f16 weights to save up disk space - original_dtype_value = ov_config.dtype - ov_config.dtype = "fp16" - + model.save_pretrained(self.args.output) + else: # TODO : add input shapes main_export( model_name_or_path=self.args.model, @@ -358,24 +331,11 @@ def run(self): # **input_shapes, ) - if quantize_after_export: - try: - from optimum.intel import OVModelForCausalLM, OVQuantizer - - ov_config.dtype = original_dtype_value - model = OVModelForCausalLM.from_pretrained( - self.args.output, trust_remote_code=self.args.trust_remote_code - ) - quantizer = OVQuantizer(model) - quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) - # TODO: set save_directory=self.args.output once OV is updated to 2024.3 - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - with tempfile.TemporaryDirectory() as temp_dir: - model.save_pretrained(temp_dir) - ov_config.save_pretrained(self.args.output) - shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") - shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") - except Exception as e: - # Delete non-compressed model if compression failed for some reason - shutil.rmtree(str(self.args.output)) - raise e + if model and not self.args.disable_convert_tokenizer: + # avoid import when using other exporters (IPEX, INC) + from ...exporters.openvino.convert import export_tokenizer + + for tokenizer_name in ("tokenizer", "tokenizer_2"): + tokenizer = getattr(model, tokenizer_name, None) + if tokenizer is not None: + export_tokenizer(tokenizer, self.args.output / tokenizer_name) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 7937deea52..48bf5d344b 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -132,6 +132,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR + # TODO: remove this way of applying quantization; instead apply it after instance of OVModel* is loaded if quantization_config: if not is_nncf_available(): raise ImportError( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2ad04ab14a..44069b0452 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -741,17 +741,7 @@ def _from_pretrained( local_files_only=local_files_only, ) - if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}: - quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config) - - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) - - load_in_4bit = quantization_config.bits == 4 if quantization_config else False - - model = cls.load_model( - model_cache_path, - quantization_config=None if load_in_4bit else quantization_config, - ) + model = cls.load_model(model_cache_path) model_type = config.model_type.replace("_", "-") if model_type == "bloom": @@ -761,17 +751,20 @@ def _from_pretrained( else: init_cls = cls - enable_compilation = kwargs.pop("compile", True) and not load_in_4bit + if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}: + quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config) + 
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + + enable_compilation = kwargs.pop("compile", True) and not quantization_config causal_model = init_cls( model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, - quantization_config=quantization_config, **kwargs, ) - if load_in_4bit: + if quantization_config: if not is_nncf_available(): raise ImportError( "Quantization of the weights requires nncf, please install it with `pip install nncf`" From d8017ab843f4bb976188ccdf4fd8fc488a5561b5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 31 May 2024 13:03:19 +0200 Subject: [PATCH 18/28] Bring back quantization_config parameter --- optimum/intel/openvino/modeling_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 44069b0452..43de424778 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -761,6 +761,7 @@ def _from_pretrained( config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, + quantization_config=quantization_config, **kwargs, ) From 24272dc5168fe7ee16ad0605f69e1d38b7cdac5c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 31 May 2024 18:22:46 +0200 Subject: [PATCH 19/28] Trigger checks From 40b0e29a5410a86a5a105d5b3bde7363428d1f60 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 3 Jun 2024 17:54:07 +0200 Subject: [PATCH 20/28] Apply comment --- optimum/commands/export/openvino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 62575f5a1f..867fb8de93 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -311,7 +311,10 @@ def run(self): # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required model = OVModelForCausalLM.from_pretrained( - self.args.model, export=True, quantization_config=quantization_config + self.args.model, + export=True, + quantization_config=quantization_config, + stateful=not self.args.disable_stateful, ) model.save_pretrained(self.args.output) else: From f54aa4061c3863b7b6202bb0498f297d0bae807c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 16:01:53 +0200 Subject: [PATCH 21/28] Save tokenizer --- optimum/commands/export/openvino.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 867fb8de93..869019ba6e 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -315,8 +315,18 @@ def run(self): export=True, quantization_config=quantization_config, stateful=not self.args.disable_stateful, + trust_remote_code=self.args.trust_remote_code, ) model.save_pretrained(self.args.output) + try: + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained( + self.args.model, trust_remote_code=self.args.trust_remote_code + ) + tokenizer.save_pretrained(self.args.output) + except: + logger.warning("Could not save tokenizer") else: # TODO : add input shapes main_export( From 96bed2989ce153c8e25f1827cf41bb4f592454f2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 16:43:12 +0200 Subject: [PATCH 22/28] Export CausalLM tokenizer --- optimum/commands/export/openvino.py | 34 ++++++++++++++++++++--------- 1 file changed, 
24 insertions(+), 10 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 869019ba6e..08d8d03e7a 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -277,7 +277,6 @@ def run(self):
         quantization_config = ov_config.quantization_config if ov_config else None
         quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None
         task = infer_task(self.args.task, self.args.model)
-        model = None
 
         if library_name == "diffusers" and quantize_with_dataset:
             if not is_diffusers_available():
@@ -306,6 +305,21 @@ def run(self):
 
             model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
             model.save_pretrained(self.args.output)
+
+            if self.args.disable_convert_tokenizer:
+                return
+
+            # avoid import when using other exporters (IPEX, INC)
+            from ...exporters.openvino.convert import export_tokenizer
+
+            output = Path(self.args.output)
+            tokenizer = getattr(model, "tokenizer", None)
+            if tokenizer is not None:
+                export_tokenizer(tokenizer, output / "tokenizer")
+
+            tokenizer_2 = getattr(model, "tokenizer_2", None)
+            if tokenizer_2 is not None:
+                export_tokenizer(tokenizer_2, output / "tokenizer_2")
         elif task.startswith("text-generation") and quantize_with_dataset:
             from optimum.intel import OVModelForCausalLM
 
@@ -318,6 +332,8 @@ def run(self):
                 trust_remote_code=self.args.trust_remote_code,
             )
             model.save_pretrained(self.args.output)
+
+            tokenizer = None
             try:
                 from transformers import AutoTokenizer
 
@@ -327,6 +343,13 @@ def run(self):
                 tokenizer.save_pretrained(self.args.output)
             except:
                 logger.warning("Could not save tokenizer")
+
+            if tokenizer and not self.args.disable_convert_tokenizer:
+                from ...exporters.openvino.convert import export_tokenizer
+
+                output = Path(self.args.output)
+                if tokenizer is not None:
+                    export_tokenizer(tokenizer, output / "tokenizer")
         else:
             # TODO : add input shapes
             main_export(
@@ -343,12 +366,3 @@ def run(self):
                 library_name=library_name,
                 # **input_shapes,
             )
-
-        if model and not self.args.disable_convert_tokenizer:
-            # avoid import when using other exporters (IPEX, INC)
-            from ...exporters.openvino.convert import export_tokenizer
-
-            for tokenizer_name in ("tokenizer", "tokenizer_2"):
-                tokenizer = getattr(model, tokenizer_name, None)
-                if tokenizer is not None:
-                    export_tokenizer(tokenizer, self.args.output / tokenizer_name)

From a6005adededb41bd3bd028d03a5cbb66649140fb Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 4 Jun 2024 16:45:43 +0200
Subject: [PATCH 23/28] Remove unnecessary if

---
 optimum/commands/export/openvino.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 08d8d03e7a..7bda9cf7eb 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -348,8 +348,7 @@ def run(self):
                 from ...exporters.openvino.convert import export_tokenizer
 
                 output = Path(self.args.output)
-                if tokenizer is not None:
-                    export_tokenizer(tokenizer, output / "tokenizer")
+                export_tokenizer(tokenizer, output / "tokenizer")

From e3119169ed46196fe5af41541f928f4e32e96d12 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 4 Jun 2024 16:48:22 +0200
Subject: [PATCH 24/28] Remove extra variable

---
 optimum/commands/export/openvino.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/optimum/commands/export/openvino.py 
b/optimum/commands/export/openvino.py index 7bda9cf7eb..ff4d091faf 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -312,14 +312,13 @@ def run(self): # avoid import when using other exporters (IPEX, INC) from ...exporters.openvino.convert import export_tokenizer - output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: - export_tokenizer(tokenizer, output / "tokenizer") + export_tokenizer(tokenizer, self.args.output / "tokenizer") tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output / "tokenizer_2") + export_tokenizer(tokenizer_2, self.args.output / "tokenizer_2") elif task.startswith("text-generation") and quantize_with_dataset: from optimum.intel import OVModelForCausalLM @@ -347,8 +346,7 @@ def run(self): if tokenizer and not self.args.disable_convert_tokenizer: from ...exporters.openvino.convert import export_tokenizer - output = Path(self.args.output) - export_tokenizer(tokenizer, output / "tokenizer") + export_tokenizer(tokenizer, self.args.output / "tokenizer") else: # TODO : add input shapes main_export( From fc4421482a083d5e0619bbeaaa18ae6093ecfac8 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 16:52:17 +0200 Subject: [PATCH 25/28] ruff --- optimum/commands/export/openvino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index ff4d091faf..7b43608e74 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -340,7 +340,7 @@ def run(self): self.args.model, trust_remote_code=self.args.trust_remote_code ) tokenizer.save_pretrained(self.args.output) - except: + except Exception as e: logger.warning("Could not save tokenizer") if tokenizer and not self.args.disable_convert_tokenizer: From 709085be2fd8295dded155a8e97dc98ef6a57fe6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 17:03:39 +0200 Subject: [PATCH 26/28] Ruff 2 --- optimum/commands/export/openvino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 7b43608e74..21f41e8961 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -340,7 +340,7 @@ def run(self): self.args.model, trust_remote_code=self.args.trust_remote_code ) tokenizer.save_pretrained(self.args.output) - except Exception as e: + except Exception: logger.warning("Could not save tokenizer") if tokenizer and not self.args.disable_convert_tokenizer: From a2084d939928b8252feb3c09a9ba6b50b239be6f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Jun 2024 13:52:34 +0200 Subject: [PATCH 27/28] Introduce a separate function to tokenizer conversion --- optimum/commands/export/openvino.py | 41 +++++-------------- optimum/exporters/openvino/__main__.py | 56 +++++++++++++++----------- 2 files changed, 43 insertions(+), 54 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 21f41e8961..631e30c5af 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -19,9 +19,11 @@ from typing import TYPE_CHECKING, Optional from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from transformers.utils.quantization_config import QuantizationMethod from ...exporters import TasksManager from ...intel.utils.import_utils import 
DIFFUSERS_IMPORT_ERROR, is_diffusers_available +from ...utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from ..base import BaseOptimumCLICommand, CommandInfo @@ -207,7 +209,7 @@ def parse_args(parser: "ArgumentParser"): return parse_args_openvino(parser) def run(self): - from ...exporters.openvino.__main__ import infer_task, main_export + from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig if self.args.fp16: @@ -251,7 +253,7 @@ def run(self): "all_layers": None if is_int8 else self.args.all_layers, "dataset": self.args.dataset, "num_samples": self.args.num_samples, - "quant_method": "awq" if self.args.awq else None, + "quant_method": QuantizationMethod.AWQ if self.args.awq else None, "sensitivity_metric": self.args.sensitivity_metric, } @@ -305,20 +307,8 @@ def run(self): model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config) model.save_pretrained(self.args.output) - - if self.args.disable_convert_tokenizer: - return - - # avoid import when using other exporters (IPEX, INC) - from ...exporters.openvino.convert import export_tokenizer - - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - export_tokenizer(tokenizer, self.args.output / "tokenizer") - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, self.args.output / "tokenizer_2") + if not self.args.disable_convert_tokenizer: + maybe_convert_tokenizers(library_name, self.args.output, model) elif task.startswith("text-generation") and quantize_with_dataset: from optimum.intel import OVModelForCausalLM @@ -332,21 +322,10 @@ def run(self): ) model.save_pretrained(self.args.output) - tokenizer = None - try: - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained( - self.args.model, trust_remote_code=self.args.trust_remote_code - ) - tokenizer.save_pretrained(self.args.output) - except Exception: - logger.warning("Could not save tokenizer") - - if tokenizer and not self.args.disable_convert_tokenizer: - from ...exporters.openvino.convert import export_tokenizer - - export_tokenizer(tokenizer, self.args.output / "tokenizer") + maybe_save_preprocessors(self.args.model, self.args.output, trust_remote_code=self.args.trust_remote_code) + if not self.args.disable_convert_tokenizer: + preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code) + maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors) else: # TODO : add input shapes main_export( diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 1204c8d4cf..927c98ac37 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -364,17 +364,35 @@ class StoreAttr(object): **kwargs_shapes, ) - # hide openvino import when using other exporters - from optimum.exporters.openvino.convert import export_tokenizer + if convert_tokenizer: + maybe_convert_tokenizers(library_name, output, model, preprocessors) - if convert_tokenizer and is_openvino_tokenizers_available(): - if library_name != "diffusers": - tokenizer = next( - (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)), - None, - ) + # Unpatch modules after GPTQ export + if do_gptq_patching: + torch.cuda.is_available = orig_cuda_check + 
GPTQQuantizer.post_init_model = orig_post_init_model + + +def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None): + """ + Tries to convert tokenizers to OV format and export them to disk. + + Arguments: + library_name (`str`): + The library name. + output (`Path`): + Path to save converted tokenizers to. + model (`PreTrainedModel`, *optional*, defaults to None): + Model instance. + preprocessors (`Iterable`, *optional*, defaults to None): + Iterable possibly containing tokenizers to be converted. + """ + from optimum.exporters.openvino.convert import export_tokenizer - if tokenizer is not None: + if is_openvino_tokenizers_available(): + if library_name != "diffusers" and preprocessors: + tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) + if tokenizer: try: export_tokenizer(tokenizer, output) except Exception as exception: @@ -382,18 +400,10 @@ class StoreAttr(object): "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer " f"models won't be generated. Exception: {exception}" ) - else: - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - export_tokenizer(tokenizer, output / "tokenizer") - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output / "tokenizer_2") - elif convert_tokenizer and not is_openvino_tokenizers_available(): + elif model: + for tokenizer_name in ("tokenizer", "tokenizer_2"): + tokenizer = getattr(model, tokenizer_name, None) + if tokenizer: + export_tokenizer(tokenizer, output / tokenizer_name) + else: logger.warning("Tokenizer won't be converted.") - - # Unpatch modules after GPTQ export - if do_gptq_patching: - torch.cuda.is_available = orig_cuda_check - GPTQQuantizer.post_init_model = orig_post_init_model From e8cc0e9e84b80110e20989dcb114fd2522a10f04 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Jun 2024 13:57:23 +0200 Subject: [PATCH 28/28] Black --- optimum/commands/export/openvino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 631e30c5af..07e1dcffae 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -324,7 +324,9 @@ def run(self): maybe_save_preprocessors(self.args.model, self.args.output, trust_remote_code=self.args.trust_remote_code) if not self.args.disable_convert_tokenizer: - preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code) + preprocessors = maybe_load_preprocessors( + self.args.model, trust_remote_code=self.args.trust_remote_code + ) maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors) else: # TODO : add input shapes
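--

Usage sketch (illustrative; not part of the patch series). The command below
mirrors the INT4 AWQ flags exercised in tests/openvino/test_exporters_cli.py
above; <MODEL_ID> and <OUTPUT_DIR> are placeholders:

    optimum-cli export openvino --model <MODEL_ID> --task text-generation-with-past \
        --weight-format int4 --ratio 1.0 --sym --group-size 16 \
        --awq --dataset wikitext2 --num-samples 100 \
        --sensitivity-metric max_activation_variance <OUTPUT_DIR>

A rough Python counterpart of the text-generation path introduced in
[PATCH 17/28] is sketched below. OVModelForCausalLM and QuantizationMethod.AWQ
are taken from the diffs above; the keyword names are assumed to be accepted by
OVWeightQuantizationConfig, matching the quantization_config dict built in
run():

    from transformers.utils.quantization_config import QuantizationMethod

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    # 4-bit symmetric weight compression with group size 16, AWQ tuned on
    # 100 wikitext2 samples -- mirrors the CLI flags shown above.
    quantization_config = OVWeightQuantizationConfig(
        bits=4,
        sym=True,
        ratio=1.0,
        group_size=16,
        quant_method=QuantizationMethod.AWQ,
        dataset="wikitext2",
        num_samples=100,
    )
    # export=True converts the checkpoint to OpenVINO IR and applies the
    # compression on the fly, as the refactored CLI path does.
    model = OVModelForCausalLM.from_pretrained(
        "<MODEL_ID>", export=True, quantization_config=quantization_config
    )
    model.save_pretrained("<OUTPUT_DIR>")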