 import gc
 import logging
+import operator
 import warnings
+from functools import reduce
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union

 from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
 from transformers.utils import is_torch_available

+from openvino.runtime import Core, Type, save_model
 from optimum.exporters import TasksManager
 from optimum.exporters.onnx.base import OnnxConfig
 from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
 from optimum.exporters.openvino.convert import export_from_model
 from optimum.intel.utils.import_utils import (
     is_openvino_tokenizers_available,
     is_openvino_version,
-    is_transformers_version,
+    is_transformers_version, is_nncf_available,
 )
 from optimum.utils.save_utils import maybe_load_preprocessors

-from .utils import clear_class_registry
-
+from .utils import clear_class_registry, _MAX_UNCOMPRESSED_SIZE


 if TYPE_CHECKING:
     from optimum.intel.openvino.configuration import OVConfig
@@ -402,7 +404,7 @@ class StoreAttr(object):
         model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
     )

-    export_from_model(
+    submodel_paths = export_from_model(
         model=model,
         output=output,
         task=task,
@@ -425,6 +427,48 @@ class StoreAttr(object):
     del model
     gc.collect()

+    core = Core()
+    compressed_submodel_paths = []
+    for submodel_path in submodel_paths:
+        submodel_path = Path(output) / submodel_path
+        submodel = core.read_model(submodel_path)
+
+        quantization_config = ov_config.quantization_config if ov_config is not None else None
+        if ov_config is None:
+            num_parameters = 0
+            for op in submodel.get_ops():
+                if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+                    num_parameters += reduce(operator.mul, op.shape, 1)
+            if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+                if is_nncf_available():
+                    quantization_config = {"bits": 8, "sym": False}
+                    logger.info("The model weights will be quantized to int8_asym.")
+                else:
+                    continue
+        if not quantization_config:
+            continue
+
+        if not is_nncf_available():
+            raise ImportError(
+                "Quantization of the weights requires nncf, please install it with `pip install nncf`"
+            )
+
+        from optimum.intel.openvino.quantization import _weight_only_quantization
+
+        _weight_only_quantization(submodel, quantization_config)
+
+        compressed_submodel_path = Path(str(submodel_path).replace(".xml", "_compressed.xml"))
+        save_model(submodel, compressed_submodel_path, compress_to_fp16=ov_config and ov_config.dtype == "fp16")
+        compressed_submodel_paths.append((submodel_path, compressed_submodel_path))
+
+        del submodel
+
+    for submodel_path, compressed_submodel_path in compressed_submodel_paths:
+        submodel_path.unlink()
+        submodel_path.with_suffix(".bin").unlink()
+        compressed_submodel_path.rename(submodel_path)
+        compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+
     # Unpatch modules after GPTQ export
     if do_gptq_patching:
         torch.cuda.is_available = orig_cuda_check
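For reference, the size gate in the second hunk estimates a submodel's parameter count by summing the element counts of its floating-point Constant ops, and only applies default compression when that count reaches _MAX_UNCOMPRESSED_SIZE. Below is a minimal standalone sketch of that same check; the model path and the threshold value are illustrative assumptions, not taken from this diff.

import operator
from functools import reduce

from openvino.runtime import Core, Type

core = Core()
# Hypothetical path to an exported OpenVINO submodel.
submodel = core.read_model("openvino_model.xml")

# Every f16/f32/bf16 Constant is assumed to hold weights; its element
# count is the product of its shape dimensions.
num_parameters = 0
for op in submodel.get_ops():
    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
        num_parameters += reduce(operator.mul, op.shape, 1)

MAX_UNCOMPRESSED_SIZE = int(1e9)  # illustrative threshold, not the value imported from .utils
print(f"compress by default: {num_parameters >= MAX_UNCOMPRESSED_SIZE}")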
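The {"bits": 8, "sym": False} config passed to _weight_only_quantization requests data-free asymmetric int8 weight compression. _weight_only_quantization is internal to optimum-intel, but a hedged sketch of roughly what that step plus the save amounts to, using NNCF's public compress_weights API (an assumption about equivalence, not the internal implementation), looks like this:

import nncf
from openvino.runtime import Core, save_model

core = Core()
submodel = core.read_model("openvino_model.xml")  # hypothetical path

# Data-free int8 asymmetric weight-only compression.
compressed = nncf.compress_weights(submodel, mode=nncf.CompressWeightsMode.INT8_ASYM)

# Write next to the original; the exporter then unlinks the original
# .xml/.bin pair and renames the compressed files into its place, as in the diff.
save_model(compressed, "openvino_model_compressed.xml", compress_to_fp16=False)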