From 56878bb0e108338f11cd6a4a37d0d81b0bb060cb Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 14:53:16 +0200 Subject: [PATCH 01/28] Add quantization with dataset after model export for text-generation models --- optimum/commands/export/openvino.py | 63 +++++++++++++++++++++++++- optimum/exporters/openvino/__main__.py | 30 ++++++------ tests/openvino/test_exporters_cli.py | 12 ++++- 3 files changed, 89 insertions(+), 16 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index ffd084d4e6..310ec30c97 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -15,6 +15,7 @@ import logging import sys +import tempfile from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -128,6 +129,29 @@ def parse_args_openvino(parser: "ArgumentParser"): "compression is applied, they are compressed to INT8." ), ) + optional_group.add_argument( + "--quant-method", + type=str, + default=None, + choices=["default", "awq", "hybrid"], + help=("The quantization method to apply. Can be one of the following: ['default', 'awq', 'hybrid']."), + ) + optional_group.add_argument( + "--sensitivity-metric", + type=str, + default=None, + help=( + "The sensitivity metric for assigning quantization precision to layers. Can be one of the following: " + "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', " + "'max_activation_variance', 'mean_activation_magnitude']." + ), + ) + optional_group.add_argument( + "--num-samples", + type=int, + default=None, + help=("The maximum number of samples composing the calibration dataset for quantization."), + ) optional_group.add_argument( "--disable-stateful", action="store_true", @@ -180,7 +204,7 @@ def parse_args(parser: "ArgumentParser"): return parse_args_openvino(parser) def run(self): - from ...exporters.openvino.__main__ import main_export + from ...exporters.openvino.__main__ import main_export, infer_task from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig if self.args.fp16: @@ -208,6 +232,9 @@ def run(self): and self.args.group_size is None and self.args.sym is None and self.args.all_layers is None + and self.args.dataset is None + and self.args.quant_method is None + and self.args.sensitivity_metric is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model] @@ -218,6 +245,10 @@ def run(self): "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, "all_layers": None if is_int8 else self.args.all_layers, + "dataset": self.args.dataset, + "num_samples": self.args.num_samples, + "quant_method": self.args.quant_method, + "sensitivity_metric": self.args.sensitivity_metric, } if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: @@ -226,7 +257,6 @@ def run(self): ) quantization_config["sym"] = "asym" not in self.args.weight_format quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64 - quantization_config["dataset"] = self.args.dataset ov_config = OVConfig(quantization_config=quantization_config) library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library) @@ -290,6 +320,19 @@ def run(self): if tokenizer_2 is not None: export_tokenizer(tokenizer_2, output / "tokenizer_2") else: + task = infer_task(self.args.task, self.args.model) + quantization_config = ov_config.quantization_config + quantize_after_export = 
( + task.startswith("text-generation") + and quantization_config is not None + and hasattr(quantization_config, "dataset") + and quantization_config.dataset is not None + ) + if quantize_after_export: + # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is + # required. That's why the quantization is skipped during export and applied explicitly after export. + ov_config.quantization_config = None + # TODO : add input shapes main_export( model_name_or_path=self.args.model, @@ -305,3 +348,19 @@ def run(self): library_name=library_name, # **input_shapes, ) + + if quantize_after_export: + from optimum.intel import OVModelForCausalLM, OVQuantizer + + model = OVModelForCausalLM.from_pretrained(self.args.output) + quantizer = OVQuantizer(model) + quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) + # TODO: set save_directory=self.args.output once OV is updated to 2024.3 + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + with tempfile.TemporaryDirectory() as temp_dir: + import shutil + + model.save_pretrained(temp_dir) + ov_config.save_pretrained(self.args.output) + shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") + shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 9db6719069..1204c8d4cf 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -44,6 +44,22 @@ logger = logging.getLogger(__name__) +def infer_task(task, model_name_or_path): + task = TasksManager.map_from_synonym(task) + if task == "auto": + try: + task = TasksManager.infer_task_from_model(model_name_or_path) + except KeyError as e: + raise KeyError( + f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + except RequestsConnectionError as e: + raise RequestsConnectionError( + f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) + return task + + def main_export( model_name_or_path: str, output: Union[str, Path], @@ -174,7 +190,7 @@ def main_export( ov_config = OVConfig(quantization_config=q_config) original_task = task - task = TasksManager.map_from_synonym(task) + task = infer_task(task, model_name_or_path) framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) library_name_is_not_provided = library_name is None library_name = TasksManager.infer_library_from_model( @@ -188,18 +204,6 @@ def main_export( ) library_name = "transformers" - if task == "auto": - try: - task = TasksManager.infer_task_from_model(model_name_or_path) - except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) - except RequestsConnectionError as e: - raise RequestsConnectionError( - f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" - ) - do_gptq_patching = False custom_architecture = False loading_kwargs = {} diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index cce25bbae1..21ced61edb 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -89,6 +89,14 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86), ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86), ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32), + ( + "text-generation-with-past", + "llama_awq", + "int4 --ratio 1.0 --sym --group-size 16 --quant-method awq --dataset wikitext2 --num-samples 100 " + "--sensitivity-metric max_activation_variance", + 4, + 28, + ), ] def _openvino_export( @@ -197,10 +205,11 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in @parameterized.expand(TEST_4BIT_CONFIGURATONS) def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int): with TemporaryDirectory() as tmpdir: - subprocess.run( + result = subprocess.run( f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", shell=True, check=True, + capture_output=True, ) model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {} model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs) @@ -208,6 +217,7 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec _, num_int8, num_int4 = get_num_quantized_nodes(model) self.assertEqual(expected_int8, num_int8) self.assertEqual(expected_int4, num_int4) + self.assertTrue("--quant-method awq" not in option or b"Applying AWQ" in result.stdout) def test_exporters_cli_help(self): subprocess.run( From 013a0f656fc015db97bbaabb4cc79634e2cd98db Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 15:22:46 +0200 Subject: [PATCH 02/28] Tweak AWQ CLI interface --- optimum/commands/export/openvino.py | 12 +++++------- tests/openvino/test_exporters_cli.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 310ec30c97..d6d0114d93 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -130,11 +130,10 @@ def parse_args_openvino(parser: "ArgumentParser"): ), ) optional_group.add_argument( - "--quant-method", - type=str, + "--awq", + action="store_true", default=None, - choices=["default", "awq", "hybrid"], - help=("The quantization method to apply. Can be one of the following: ['default', 'awq', 'hybrid']."), + help="Whether to apply AWQ algorithm. 
To run AWQ, please also provide a dataset.", ) optional_group.add_argument( "--sensitivity-metric", @@ -150,7 +149,7 @@ def parse_args_openvino(parser: "ArgumentParser"): "--num-samples", type=int, default=None, - help=("The maximum number of samples composing the calibration dataset for quantization."), + help="The maximum number of samples to take from the dataset for quantization.", ) optional_group.add_argument( "--disable-stateful", @@ -233,7 +232,6 @@ def run(self): and self.args.sym is None and self.args.all_layers is None and self.args.dataset is None - and self.args.quant_method is None and self.args.sensitivity_metric is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): @@ -247,7 +245,7 @@ def run(self): "all_layers": None if is_int8 else self.args.all_layers, "dataset": self.args.dataset, "num_samples": self.args.num_samples, - "quant_method": self.args.quant_method, + "quant_method": "awq" if self.args.awq else None, "sensitivity_metric": self.args.sensitivity_metric, } diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 21ced61edb..267aa88d62 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -92,7 +92,7 @@ class OVCLIExportTestCase(unittest.TestCase): ( "text-generation-with-past", "llama_awq", - "int4 --ratio 1.0 --sym --group-size 16 --quant-method awq --dataset wikitext2 --num-samples 100 " + "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 " "--sensitivity-metric max_activation_variance", 4, 28, From c566ccc094d4999dba3888a55ea43520e08850f5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 15:24:45 +0200 Subject: [PATCH 03/28] Additional checks --- optimum/commands/export/openvino.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index d6d0114d93..4594cb387e 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -232,6 +232,8 @@ def run(self): and self.args.sym is None and self.args.all_layers is None and self.args.dataset is None + and self.args.num_samples is None + and self.args.awq is None and self.args.sensitivity_metric is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): From 0a8fba022c92058c923f5af46370c53419a28df5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 16:26:04 +0200 Subject: [PATCH 04/28] Fix --- optimum/commands/export/openvino.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 4594cb387e..42f09c1576 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -203,7 +203,7 @@ def parse_args(parser: "ArgumentParser"): return parse_args_openvino(parser) def run(self): - from ...exporters.openvino.__main__ import main_export, infer_task + from ...exporters.openvino.__main__ import infer_task, main_export from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig if self.args.fp16: @@ -321,10 +321,10 @@ def run(self): export_tokenizer(tokenizer_2, output / "tokenizer_2") else: task = infer_task(self.args.task, self.args.model) - quantization_config = ov_config.quantization_config + quantization_config = ov_config.quantization_config if ov_config else None quantize_after_export = ( task.startswith("text-generation") - and quantization_config is not None + and quantization_config and hasattr(quantization_config, "dataset") 
and quantization_config.dataset is not None ) From 6dbb4fe8c1bca94f3a240ec9e0e312a58b854d18 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 21 May 2024 19:00:15 +0200 Subject: [PATCH 05/28] Trigger Build From 3722624bf011fe651d478fe0b4c21c5b7e729486 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 15:43:54 +0200 Subject: [PATCH 06/28] Add AWQ description --- optimum/commands/export/openvino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 42f09c1576..758b6ae65d 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -133,7 +133,10 @@ def parse_args_openvino(parser: "ArgumentParser"): "--awq", action="store_true", default=None, - help="Whether to apply AWQ algorithm. To run AWQ, please also provide a dataset.", + help=( + "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires " + "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset." + ), ) optional_group.add_argument( "--sensitivity-metric", From dee582d2abb0087564bce0ff7e62511fa8328048 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 16:37:26 +0200 Subject: [PATCH 07/28] Add trust remote code argument --- optimum/commands/export/openvino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 758b6ae65d..549b52c750 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -355,7 +355,10 @@ def run(self): if quantize_after_export: from optimum.intel import OVModelForCausalLM, OVQuantizer - model = OVModelForCausalLM.from_pretrained(self.args.output) + model = OVModelForCausalLM.from_pretrained( + self.args.output, + trust_remote_code=self.args.trust_remote_code + ) quantizer = OVQuantizer(model) quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) # TODO: set save_directory=self.args.output once OV is updated to 2024.3 From a44c0960ec00966d4b36ecf7a49ab9bfd24b5e61 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 16:47:14 +0200 Subject: [PATCH 08/28] Black --- optimum/commands/export/openvino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 549b52c750..e72b876212 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -356,8 +356,7 @@ def run(self): from optimum.intel import OVModelForCausalLM, OVQuantizer model = OVModelForCausalLM.from_pretrained( - self.args.output, - trust_remote_code=self.args.trust_remote_code + self.args.output, trust_remote_code=self.args.trust_remote_code ) quantizer = OVQuantizer(model) quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) From 12dc6720ef097e29fac44612a936ce2fc9478a88 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 22 May 2024 17:03:32 +0200 Subject: [PATCH 09/28] Add note about possibility of skipping AWQ --- optimum/commands/export/openvino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index e72b876212..5a7e466ebc 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -135,7 +135,9 @@ def 
parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires "
-            "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset."
+            "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset "
+            "argument. Note: it's possible that there will be no matching patterns in the model to apply AWQ, in which "
+            "case it will be skipped."
         ),
     )
     optional_group.add_argument(

From bcc46652a8c47f09dd92d6a0ec4185c8da20ddb1 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Thu, 23 May 2024 13:28:17 +0200
Subject: [PATCH 10/28] Removed saving to temporary directory; added core property handling for OVModelForCausalLM

---
 optimum/commands/export/openvino.py        | 16 +++++-----------
 optimum/intel/openvino/modeling_base.py    |  5 +++++
 optimum/intel/openvino/modeling_decoder.py |  7 +++++++
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 5a7e466ebc..1276340871 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -15,7 +15,6 @@
 
 import logging
 import sys
-import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -357,17 +356,12 @@ def run(self):
         if quantize_after_export:
             from optimum.intel import OVModelForCausalLM, OVQuantizer
 
+            # TODO: remove disabling mmap once OV is updated to 2024.3
             model = OVModelForCausalLM.from_pretrained(
-                self.args.output, trust_remote_code=self.args.trust_remote_code
+                self.args.output, trust_remote_code=self.args.trust_remote_code, ov_config={"ENABLE_MMAP": "NO"}
             )
             quantizer = OVQuantizer(model)
             quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output)
-            # TODO: set save_directory=self.args.output once OV is updated to 2024.3
-            quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
-            with tempfile.TemporaryDirectory() as temp_dir:
-                import shutil
-
-                model.save_pretrained(temp_dir)
-                ov_config.save_pretrained(self.args.output)
-                shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml")
-                shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin")
+            quantizer.quantize(
+                ov_config=OVConfig(quantization_config=quantization_config), save_directory=self.args.output
+            )
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 7937deea52..e61a1c7eca 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -103,6 +103,7 @@ def __init__(
     def load_model(
         file_name: Union[str, Path],
         quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        ov_core_properties: Optional[Dict] = None,
     ):
         """
         Loads the model.
@@ -112,6 +113,8 @@ def load_model(
                 The path of the model ONNX or XML file.
             quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
                 Quantization config to apply after model is loaded.
+            ov_core_properties (`Dict`, *optional*):
+                OpenVINO core properties to set before model loading.
""" def fix_op_names_duplicates(model: openvino.runtime.Model): @@ -128,6 +131,8 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if isinstance(file_name, str): file_name = Path(file_name) + if ov_core_properties: + core.set_property(ov_core_properties) model = core.read_model(file_name) if not file_name.suffix == ".onnx" else convert_model(file_name) if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2ad04ab14a..cf913c15b1 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -748,9 +748,16 @@ def _from_pretrained( load_in_4bit = quantization_config.bits == 4 if quantization_config else False + ov_config = kwargs.get("ov_config", None) + ov_core_properties = {} + if ov_config and "ENABLE_MMAP" in ov_config: + ov_core_properties["ENABLE_MMAP"] = ov_config["ENABLE_MMAP"] + del ov_config["ENABLE_MMAP"] + model = cls.load_model( model_cache_path, quantization_config=None if load_in_4bit else quantization_config, + ov_core_properties=ov_core_properties, ) model_type = config.model_type.replace("_", "-") From 40058dad55b7fe010cf03c62ae7b5ac8361d1847 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 23 May 2024 13:48:02 +0200 Subject: [PATCH 11/28] Revert "Removed saving to temporary directory; added core property handling for OVModelForCausalLM" This reverts commit bcc46652a8c47f09dd92d6a0ec4185c8da20ddb1. --- optimum/commands/export/openvino.py | 16 +++++++++++----- optimum/intel/openvino/modeling_base.py | 5 ----- optimum/intel/openvino/modeling_decoder.py | 7 ------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 1276340871..5a7e466ebc 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -15,6 +15,7 @@ import logging import sys +import tempfile from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -356,12 +357,17 @@ def run(self): if quantize_after_export: from optimum.intel import OVModelForCausalLM, OVQuantizer - # TODO: remove disabling mmap once OV is updated to 2024.3 model = OVModelForCausalLM.from_pretrained( - self.args.output, trust_remote_code=self.args.trust_remote_code, ov_config={"ENABLE_MMAP": "NO"} + self.args.output, trust_remote_code=self.args.trust_remote_code ) quantizer = OVQuantizer(model) quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) - quantizer.quantize( - ov_config=OVConfig(quantization_config=quantization_config), save_directory=self.args.output - ) + # TODO: set save_directory=self.args.output once OV is updated to 2024.3 + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + with tempfile.TemporaryDirectory() as temp_dir: + import shutil + + model.save_pretrained(temp_dir) + ov_config.save_pretrained(self.args.output) + shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") + shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index e61a1c7eca..7937deea52 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -103,7 +103,6 @@ def __init__( def load_model( file_name: Union[str, Path], 
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, - ov_core_properties: Optional[Dict] = None, ): """ Loads the model. @@ -113,8 +112,6 @@ def load_model( The path of the model ONNX or XML file. quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*): Quantization config to apply after model is loaded. - ov_core_properties (`Dict`, *optional*): - OpenVINO core properties to set before model loading. """ def fix_op_names_duplicates(model: openvino.runtime.Model): @@ -131,8 +128,6 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if isinstance(file_name, str): file_name = Path(file_name) - if ov_core_properties: - core.set_property(ov_core_properties) model = core.read_model(file_name) if not file_name.suffix == ".onnx" else convert_model(file_name) if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index cf913c15b1..2ad04ab14a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -748,16 +748,9 @@ def _from_pretrained( load_in_4bit = quantization_config.bits == 4 if quantization_config else False - ov_config = kwargs.get("ov_config", None) - ov_core_properties = {} - if ov_config and "ENABLE_MMAP" in ov_config: - ov_core_properties["ENABLE_MMAP"] = ov_config["ENABLE_MMAP"] - del ov_config["ENABLE_MMAP"] - model = cls.load_model( model_cache_path, quantization_config=None if load_in_4bit else quantization_config, - ov_core_properties=ov_core_properties, ) model_type = config.model_type.replace("_", "-") From 0886f7e29cae7bca8b118958c06d282f900b6110 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 23 May 2024 14:15:50 +0200 Subject: [PATCH 12/28] Add saving intermediate weights in fp16; add removal of intermediate model if compression fails --- optimum/commands/export/openvino.py | 40 +++++++++++++++++------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 5a7e466ebc..d3d49c18bf 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -14,6 +14,7 @@ """Defines the command line for the export with OpenVINO.""" import logging +import shutil import sys import tempfile from pathlib import Path @@ -337,6 +338,9 @@ def run(self): # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is # required. That's why the quantization is skipped during export and applied explicitly after export. 
ov_config.quantization_config = None + # Export intermediate model with f16 weights to save up disk space + original_dtype_value = ov_config.dtype + ov_config.dtype = "fp16" # TODO : add input shapes main_export( @@ -355,19 +359,23 @@ def run(self): ) if quantize_after_export: - from optimum.intel import OVModelForCausalLM, OVQuantizer - - model = OVModelForCausalLM.from_pretrained( - self.args.output, trust_remote_code=self.args.trust_remote_code - ) - quantizer = OVQuantizer(model) - quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) - # TODO: set save_directory=self.args.output once OV is updated to 2024.3 - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - with tempfile.TemporaryDirectory() as temp_dir: - import shutil - - model.save_pretrained(temp_dir) - ov_config.save_pretrained(self.args.output) - shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") - shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") + try: + from optimum.intel import OVModelForCausalLM, OVQuantizer + + ov_config.dtype = original_dtype_value + model = OVModelForCausalLM.from_pretrained( + self.args.output, trust_remote_code=self.args.trust_remote_code + ) + quantizer = OVQuantizer(model) + quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) + # TODO: set save_directory=self.args.output once OV is updated to 2024.3 + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + with tempfile.TemporaryDirectory() as temp_dir: + model.save_pretrained(temp_dir) + ov_config.save_pretrained(self.args.output) + shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") + shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") + except Exception as e: + # Delete non-compressed model if compression failed for some reason + shutil.rmtree(str(self.args.output)) + raise e From ee9b1b7ec05233459485b14cc72a5f028b749ac9 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 23 May 2024 16:16:05 +0200 Subject: [PATCH 13/28] Trigger checks From cb570682fdba239aa1e9a1813436a2ae329d81d5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 24 May 2024 10:49:12 +0200 Subject: [PATCH 14/28] Trigger checks From ee0b67fd096d8b76096b46078bd16eb4ae78f2d2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 28 May 2024 08:35:27 +0200 Subject: [PATCH 15/28] Trigger checks From cacbb36d20e63cde32aabfd01d2a01cecb0ea6ac Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 31 May 2024 12:00:28 +0200 Subject: [PATCH 16/28] Fix test --- tests/openvino/test_exporters_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 267aa88d62..c81761bc9f 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -217,7 +217,7 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec _, num_int8, num_int4 = get_num_quantized_nodes(model) self.assertEqual(expected_int8, num_int8) self.assertEqual(expected_int4, num_int4) - self.assertTrue("--quant-method awq" not in option or b"Applying AWQ" in result.stdout) + self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout) def test_exporters_cli_help(self): subprocess.run( From 814d96c0d4923465c7c724224654fa90237271fa Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: 
Fri, 31 May 2024 12:00:50 +0200 Subject: [PATCH 17/28] Refactor applying quantization with dataset --- optimum/commands/export/openvino.py | 84 ++++++---------------- optimum/intel/openvino/modeling_base.py | 1 + optimum/intel/openvino/modeling_decoder.py | 21 ++---- 3 files changed, 30 insertions(+), 76 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index d3d49c18bf..62575f5a1f 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -14,9 +14,7 @@ """Defines the command line for the export with OpenVINO.""" import logging -import shutil import sys -import tempfile from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -276,12 +274,12 @@ def run(self): if self.args.convert_tokenizer: logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") - if ( - library_name == "diffusers" - and ov_config - and ov_config.quantization_config - and ov_config.quantization_config.dataset is not None - ): + quantization_config = ov_config.quantization_config if ov_config else None + quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None + task = infer_task(self.args.task, self.args.model) + model = None + + if library_name == "diffusers" and quantize_with_dataset: if not is_diffusers_available(): raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models")) @@ -306,42 +304,17 @@ def run(self): else: raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") - model = model_cls.from_pretrained( - self.args.model, export=True, quantization_config=ov_config.quantization_config - ) + model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config) model.save_pretrained(self.args.output) + elif task.startswith("text-generation") and quantize_with_dataset: + from optimum.intel import OVModelForCausalLM - if self.args.disable_convert_tokenizer: - return - - # avoid import when using other exporters (IPEX, INC) - from ...exporters.openvino.convert import export_tokenizer - - output = Path(self.args.output) - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - export_tokenizer(tokenizer, output / "tokenizer") - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output / "tokenizer_2") - else: - task = infer_task(self.args.task, self.args.model) - quantization_config = ov_config.quantization_config if ov_config else None - quantize_after_export = ( - task.startswith("text-generation") - and quantization_config - and hasattr(quantization_config, "dataset") - and quantization_config.dataset is not None + # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required + model = OVModelForCausalLM.from_pretrained( + self.args.model, export=True, quantization_config=quantization_config ) - if quantize_after_export: - # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is - # required. That's why the quantization is skipped during export and applied explicitly after export. 
- ov_config.quantization_config = None - # Export intermediate model with f16 weights to save up disk space - original_dtype_value = ov_config.dtype - ov_config.dtype = "fp16" - + model.save_pretrained(self.args.output) + else: # TODO : add input shapes main_export( model_name_or_path=self.args.model, @@ -358,24 +331,11 @@ def run(self): # **input_shapes, ) - if quantize_after_export: - try: - from optimum.intel import OVModelForCausalLM, OVQuantizer - - ov_config.dtype = original_dtype_value - model = OVModelForCausalLM.from_pretrained( - self.args.output, trust_remote_code=self.args.trust_remote_code - ) - quantizer = OVQuantizer(model) - quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output) - # TODO: set save_directory=self.args.output once OV is updated to 2024.3 - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - with tempfile.TemporaryDirectory() as temp_dir: - model.save_pretrained(temp_dir) - ov_config.save_pretrained(self.args.output) - shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml") - shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin") - except Exception as e: - # Delete non-compressed model if compression failed for some reason - shutil.rmtree(str(self.args.output)) - raise e + if model and not self.args.disable_convert_tokenizer: + # avoid import when using other exporters (IPEX, INC) + from ...exporters.openvino.convert import export_tokenizer + + for tokenizer_name in ("tokenizer", "tokenizer_2"): + tokenizer = getattr(model, tokenizer_name, None) + if tokenizer is not None: + export_tokenizer(tokenizer, self.args.output / tokenizer_name) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 7937deea52..48bf5d344b 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -132,6 +132,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): if file_name.suffix == ".onnx": model = fix_op_names_duplicates(model) # should be called during model conversion to IR + # TODO: remove this way of applying quantization; instead apply it after instance of OVModel* is loaded if quantization_config: if not is_nncf_available(): raise ImportError( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2ad04ab14a..44069b0452 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -741,17 +741,7 @@ def _from_pretrained( local_files_only=local_files_only, ) - if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}: - quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config) - - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) - - load_in_4bit = quantization_config.bits == 4 if quantization_config else False - - model = cls.load_model( - model_cache_path, - quantization_config=None if load_in_4bit else quantization_config, - ) + model = cls.load_model(model_cache_path) model_type = config.model_type.replace("_", "-") if model_type == "bloom": @@ -761,17 +751,20 @@ def _from_pretrained( else: init_cls = cls - enable_compilation = kwargs.pop("compile", True) and not load_in_4bit + if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}: + quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config) + 
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + + enable_compilation = kwargs.pop("compile", True) and not quantization_config causal_model = init_cls( model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, - quantization_config=quantization_config, **kwargs, ) - if load_in_4bit: + if quantization_config: if not is_nncf_available(): raise ImportError( "Quantization of the weights requires nncf, please install it with `pip install nncf`" From d8017ab843f4bb976188ccdf4fd8fc488a5561b5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 31 May 2024 13:03:19 +0200 Subject: [PATCH 18/28] Bring back quantization_config parameter --- optimum/intel/openvino/modeling_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 44069b0452..43de424778 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -761,6 +761,7 @@ def _from_pretrained( config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, + quantization_config=quantization_config, **kwargs, ) From 24272dc5168fe7ee16ad0605f69e1d38b7cdac5c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 31 May 2024 18:22:46 +0200 Subject: [PATCH 19/28] Trigger checks From 40b0e29a5410a86a5a105d5b3bde7363428d1f60 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 3 Jun 2024 17:54:07 +0200 Subject: [PATCH 20/28] Apply comment --- optimum/commands/export/openvino.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 62575f5a1f..867fb8de93 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -311,7 +311,10 @@ def run(self): # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required model = OVModelForCausalLM.from_pretrained( - self.args.model, export=True, quantization_config=quantization_config + self.args.model, + export=True, + quantization_config=quantization_config, + stateful=not self.args.disable_stateful, ) model.save_pretrained(self.args.output) else: From f54aa4061c3863b7b6202bb0498f297d0bae807c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 16:01:53 +0200 Subject: [PATCH 21/28] Save tokenizer --- optimum/commands/export/openvino.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 867fb8de93..869019ba6e 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -315,8 +315,18 @@ def run(self): export=True, quantization_config=quantization_config, stateful=not self.args.disable_stateful, + trust_remote_code=self.args.trust_remote_code, ) model.save_pretrained(self.args.output) + try: + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained( + self.args.model, trust_remote_code=self.args.trust_remote_code + ) + tokenizer.save_pretrained(self.args.output) + except: + logger.warning("Could not save tokenizer") else: # TODO : add input shapes main_export( From 96bed2989ce153c8e25f1827cf41bb4f592454f2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 16:43:12 +0200 Subject: [PATCH 22/28] Export CausalLM tokenizer --- optimum/commands/export/openvino.py | 34 ++++++++++++++++++++--------- 1 file changed, 
24 insertions(+), 10 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 869019ba6e..08d8d03e7a 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -277,7 +277,6 @@ def run(self):
         quantization_config = ov_config.quantization_config if ov_config else None
         quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None
         task = infer_task(self.args.task, self.args.model)
-        model = None
 
         if library_name == "diffusers" and quantize_with_dataset:
             if not is_diffusers_available():
@@ -306,6 +305,21 @@ def run(self):
 
             model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
             model.save_pretrained(self.args.output)
+
+            if self.args.disable_convert_tokenizer:
+                return
+
+            # avoid import when using other exporters (IPEX, INC)
+            from ...exporters.openvino.convert import export_tokenizer
+
+            output = Path(self.args.output)
+            tokenizer = getattr(model, "tokenizer", None)
+            if tokenizer is not None:
+                export_tokenizer(tokenizer, output / "tokenizer")
+
+            tokenizer_2 = getattr(model, "tokenizer_2", None)
+            if tokenizer_2 is not None:
+                export_tokenizer(tokenizer_2, output / "tokenizer_2")
         elif task.startswith("text-generation") and quantize_with_dataset:
             from optimum.intel import OVModelForCausalLM
 
@@ -318,6 +332,8 @@ def run(self):
                 trust_remote_code=self.args.trust_remote_code,
             )
             model.save_pretrained(self.args.output)
+
+            tokenizer = None
             try:
                 from transformers import AutoTokenizer
 
@@ -327,6 +343,13 @@ def run(self):
                 tokenizer.save_pretrained(self.args.output)
             except:
                 logger.warning("Could not save tokenizer")
+
+            if tokenizer and not self.args.disable_convert_tokenizer:
+                from ...exporters.openvino.convert import export_tokenizer
+
+                output = Path(self.args.output)
+                if tokenizer is not None:
+                    export_tokenizer(tokenizer, output / "tokenizer")
         else:
             # TODO : add input shapes
             main_export(
@@ -343,12 +366,3 @@ def run(self):
                 library_name=library_name,
                 # **input_shapes,
             )
-
-        if model and not self.args.disable_convert_tokenizer:
-            # avoid import when using other exporters (IPEX, INC)
-            from ...exporters.openvino.convert import export_tokenizer
-
-            for tokenizer_name in ("tokenizer", "tokenizer_2"):
-                tokenizer = getattr(model, tokenizer_name, None)
-                if tokenizer is not None:
-                    export_tokenizer(tokenizer, self.args.output / tokenizer_name)

From a6005adededb41bd3bd028d03a5cbb66649140fb Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 4 Jun 2024 16:45:43 +0200
Subject: [PATCH 23/28] Remove unnecessary if

---
 optimum/commands/export/openvino.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 08d8d03e7a..7bda9cf7eb 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -348,8 +348,7 @@ def run(self):
                 from ...exporters.openvino.convert import export_tokenizer
 
                 output = Path(self.args.output)
-                if tokenizer is not None:
-                    export_tokenizer(tokenizer, output / "tokenizer")
+                export_tokenizer(tokenizer, output / "tokenizer")

From e3119169ed46196fe5af41541f928f4e32e96d12 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 4 Jun 2024 16:48:22 +0200
Subject: [PATCH 24/28] Remove extra variable

---
 optimum/commands/export/openvino.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/optimum/commands/export/openvino.py 
b/optimum/commands/export/openvino.py index 7bda9cf7eb..ff4d091faf 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -312,14 +312,13 @@ def run(self): # avoid import when using other exporters (IPEX, INC) from ...exporters.openvino.convert import export_tokenizer - output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: - export_tokenizer(tokenizer, output / "tokenizer") + export_tokenizer(tokenizer, self.args.output / "tokenizer") tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output / "tokenizer_2") + export_tokenizer(tokenizer_2, self.args.output / "tokenizer_2") elif task.startswith("text-generation") and quantize_with_dataset: from optimum.intel import OVModelForCausalLM @@ -347,8 +346,7 @@ def run(self): if tokenizer and not self.args.disable_convert_tokenizer: from ...exporters.openvino.convert import export_tokenizer - output = Path(self.args.output) - export_tokenizer(tokenizer, output / "tokenizer") + export_tokenizer(tokenizer, self.args.output / "tokenizer") else: # TODO : add input shapes main_export( From fc4421482a083d5e0619bbeaaa18ae6093ecfac8 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 16:52:17 +0200 Subject: [PATCH 25/28] ruff --- optimum/commands/export/openvino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index ff4d091faf..7b43608e74 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -340,7 +340,7 @@ def run(self): self.args.model, trust_remote_code=self.args.trust_remote_code ) tokenizer.save_pretrained(self.args.output) - except: + except Exception as e: logger.warning("Could not save tokenizer") if tokenizer and not self.args.disable_convert_tokenizer: From 709085be2fd8295dded155a8e97dc98ef6a57fe6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 4 Jun 2024 17:03:39 +0200 Subject: [PATCH 26/28] Ruff 2 --- optimum/commands/export/openvino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 7b43608e74..21f41e8961 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -340,7 +340,7 @@ def run(self): self.args.model, trust_remote_code=self.args.trust_remote_code ) tokenizer.save_pretrained(self.args.output) - except Exception as e: + except Exception: logger.warning("Could not save tokenizer") if tokenizer and not self.args.disable_convert_tokenizer: From a2084d939928b8252feb3c09a9ba6b50b239be6f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Jun 2024 13:52:34 +0200 Subject: [PATCH 27/28] Introduce a separate function to tokenizer conversion --- optimum/commands/export/openvino.py | 41 +++++-------------- optimum/exporters/openvino/__main__.py | 56 +++++++++++++++----------- 2 files changed, 43 insertions(+), 54 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 21f41e8961..631e30c5af 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -19,9 +19,11 @@ from typing import TYPE_CHECKING, Optional from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from transformers.utils.quantization_config import QuantizationMethod from ...exporters import TasksManager from ...intel.utils.import_utils import 
DIFFUSERS_IMPORT_ERROR, is_diffusers_available +from ...utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from ..base import BaseOptimumCLICommand, CommandInfo @@ -207,7 +209,7 @@ def parse_args(parser: "ArgumentParser"): return parse_args_openvino(parser) def run(self): - from ...exporters.openvino.__main__ import infer_task, main_export + from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig if self.args.fp16: @@ -251,7 +253,7 @@ def run(self): "all_layers": None if is_int8 else self.args.all_layers, "dataset": self.args.dataset, "num_samples": self.args.num_samples, - "quant_method": "awq" if self.args.awq else None, + "quant_method": QuantizationMethod.AWQ if self.args.awq else None, "sensitivity_metric": self.args.sensitivity_metric, } @@ -305,20 +307,8 @@ def run(self): model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config) model.save_pretrained(self.args.output) - - if self.args.disable_convert_tokenizer: - return - - # avoid import when using other exporters (IPEX, INC) - from ...exporters.openvino.convert import export_tokenizer - - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - export_tokenizer(tokenizer, self.args.output / "tokenizer") - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, self.args.output / "tokenizer_2") + if not self.args.disable_convert_tokenizer: + maybe_convert_tokenizers(library_name, self.args.output, model) elif task.startswith("text-generation") and quantize_with_dataset: from optimum.intel import OVModelForCausalLM @@ -332,21 +322,10 @@ def run(self): ) model.save_pretrained(self.args.output) - tokenizer = None - try: - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained( - self.args.model, trust_remote_code=self.args.trust_remote_code - ) - tokenizer.save_pretrained(self.args.output) - except Exception: - logger.warning("Could not save tokenizer") - - if tokenizer and not self.args.disable_convert_tokenizer: - from ...exporters.openvino.convert import export_tokenizer - - export_tokenizer(tokenizer, self.args.output / "tokenizer") + maybe_save_preprocessors(self.args.model, self.args.output, trust_remote_code=self.args.trust_remote_code) + if not self.args.disable_convert_tokenizer: + preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code) + maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors) else: # TODO : add input shapes main_export( diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 1204c8d4cf..927c98ac37 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -364,17 +364,35 @@ class StoreAttr(object): **kwargs_shapes, ) - # hide openvino import when using other exporters - from optimum.exporters.openvino.convert import export_tokenizer + if convert_tokenizer: + maybe_convert_tokenizers(library_name, output, model, preprocessors) - if convert_tokenizer and is_openvino_tokenizers_available(): - if library_name != "diffusers": - tokenizer = next( - (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)), - None, - ) + # Unpatch modules after GPTQ export + if do_gptq_patching: + torch.cuda.is_available = orig_cuda_check + 
GPTQQuantizer.post_init_model = orig_post_init_model + + +def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None): + """ + Tries to convert tokenizers to OV format and export them to disk. + + Arguments: + library_name (`str`): + The library name. + output (`Path`): + Path to save converted tokenizers to. + model (`PreTrainedModel`, *optional*, defaults to None): + Model instance. + preprocessors (`Iterable`, *optional*, defaults to None): + Iterable possibly containing tokenizers to be converted. + """ + from optimum.exporters.openvino.convert import export_tokenizer - if tokenizer is not None: + if is_openvino_tokenizers_available(): + if library_name != "diffusers" and preprocessors: + tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) + if tokenizer: try: export_tokenizer(tokenizer, output) except Exception as exception: @@ -382,18 +400,10 @@ class StoreAttr(object): "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer " f"models won't be generated. Exception: {exception}" ) - else: - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - export_tokenizer(tokenizer, output / "tokenizer") - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output / "tokenizer_2") - elif convert_tokenizer and not is_openvino_tokenizers_available(): + elif model: + for tokenizer_name in ("tokenizer", "tokenizer_2"): + tokenizer = getattr(model, tokenizer_name, None) + if tokenizer: + export_tokenizer(tokenizer, output / tokenizer_name) + else: logger.warning("Tokenizer won't be converted.") - - # Unpatch modules after GPTQ export - if do_gptq_patching: - torch.cuda.is_available = orig_cuda_check - GPTQQuantizer.post_init_model = orig_post_init_model From e8cc0e9e84b80110e20989dcb114fd2522a10f04 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 5 Jun 2024 13:57:23 +0200 Subject: [PATCH 28/28] Black --- optimum/commands/export/openvino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 631e30c5af..07e1dcffae 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -324,7 +324,9 @@ def run(self): maybe_save_preprocessors(self.args.model, self.args.output, trust_remote_code=self.args.trust_remote_code) if not self.args.disable_convert_tokenizer: - preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code) + preprocessors = maybe_load_preprocessors( + self.args.model, trust_remote_code=self.args.trust_remote_code + ) maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors) else: # TODO : add input shapes
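--

Usage sketch (illustrative; not part of the patch series). The command below
mirrors the INT4 AWQ flags exercised in tests/openvino/test_exporters_cli.py
above; <MODEL_ID> and <OUTPUT_DIR> are placeholders:

    optimum-cli export openvino --model <MODEL_ID> --task text-generation-with-past \
        --weight-format int4 --ratio 1.0 --sym --group-size 16 \
        --awq --dataset wikitext2 --num-samples 100 \
        --sensitivity-metric max_activation_variance <OUTPUT_DIR>

A rough Python counterpart of the text-generation path introduced in
[PATCH 17/28] is sketched below. OVModelForCausalLM and QuantizationMethod.AWQ
are taken from the diffs above; the keyword names are assumed to be accepted by
OVWeightQuantizationConfig, matching the quantization_config dict built in
run():

    from transformers.utils.quantization_config import QuantizationMethod

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    # 4-bit symmetric weight compression with group size 16, AWQ tuned on
    # 100 wikitext2 samples -- mirrors the CLI flags shown above.
    quantization_config = OVWeightQuantizationConfig(
        bits=4,
        sym=True,
        ratio=1.0,
        group_size=16,
        quant_method=QuantizationMethod.AWQ,
        dataset="wikitext2",
        num_samples=100,
    )
    # export=True converts the checkpoint to OpenVINO IR and applies the
    # compression on the fly, as the refactored CLI path does.
    model = OVModelForCausalLM.from_pretrained(
        "<MODEL_ID>", export=True, quantization_config=quantization_config
    )
    model.save_pretrained("<OUTPUT_DIR>")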