Save an openvino config summarizing all information related to quantization when saving model #578

Merged
merged 18 commits into from Mar 1, 2024
3 changes: 0 additions & 3 deletions docs/source/optimization_ov.mdx
Collaborator

Do I understand correctly that with these changes you need to add a quantization_config to .from_pretrained when loading a model, but that it doesn't actually do anything until you call quantizer.quantize?
What is the reason for that change? If the motivation is to save the config, it seems we could just update it after quantizing the model?

Collaborator Author

There is no modification of the quantization process in this PR: you can still apply quantization by providing the quantization_config or load_in_8bit arguments when creating an instance of OVModel. The main addition is that we now save the configuration containing all the information related to quantization when saving the model with model.save_pretrained(output_dir). I also removed the default compression value that was set when creating an OVConfig instance, as it's not used (we should start moving all quantization parameters into quantization_config in the future).
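
A minimal sketch of the behavior described above, assuming the optimum-intel API; the model id, output directory, and the openvino_config.json file name are placeholders/assumptions rather than verbatim from this PR:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Quantization is still applied at load time, either via load_in_8bit=True or an
# explicit quantization_config (8-bit weight-only compression here).
quantization_config = OVWeightQuantizationConfig(bits=8)
model = OVModelForCausalLM.from_pretrained(
    "gpt2", export=True, quantization_config=quantization_config
)

# What this PR adds: saving the model also writes the OVConfig summarizing the
# quantization settings (openvino_config.json) next to the OpenVINO IR files.
model.save_pretrained("ov_model_int8")
```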

@@ -38,8 +38,6 @@ save_dir = "ptq_model"
def preprocess_function(examples, tokenizer):
return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

# Load the default quantization configuration detailing the quantization we wish to apply
quantization_config = OVConfig()

# Instantiate our OVQuantizer using the desired configuration
quantizer = OVQuantizer.from_pretrained(model)
# Create the calibration dataset used to perform static quantization
@@ -52,7 +50,6 @@ calibration_dataset = quantizer.get_calibration_dataset(
)
# Apply static quantization and export the resulting quantized model to OpenVINO IR format
quantizer.quantize(
quantization_config=quantization_config,
calibration_dataset=calibration_dataset,
save_directory=save_dir,
)
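For reference, a sketch of the full simplified flow that the updated documentation now describes; the model id, dataset, and sample count are assumptions taken from the surrounding docs page rather than this diff:

```python
from functools import partial

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # placeholder
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
save_dir = "ptq_model"

def preprocess_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

# No OVConfig needs to be built by hand anymore: a default quantization
# configuration is generated internally when none is provided.
quantizer = OVQuantizer.from_pretrained(model)
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
    num_samples=300,
    dataset_split="train",
)
quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
```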
2 changes: 1 addition & 1 deletion optimum/intel/openvino/configuration.py
@@ -114,7 +114,7 @@ def __init__(
**kwargs,
):
super().__init__()
self.compression = compression or DEFAULT_QUANTIZATION_CONFIG
self.compression = compression
self.input_info = input_info
self.save_onnx_model = save_onnx_model
self._enable_standard_onnx_export_option()
38 changes: 34 additions & 4 deletions optimum/intel/openvino/modeling_base.py
@@ -57,6 +57,7 @@ def __init__(
dynamic_shapes: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
self.config = config
@@ -91,6 +92,10 @@ def __init__(

self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None

self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)

@staticmethod
def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None):
"""
Expand Down Expand Up @@ -143,6 +148,15 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
dst_path = os.path.join(save_directory, OV_XML_FILE_NAME)
openvino.save_model(self.model, dst_path, compress_to_fp16=False)

self._save_openvino_config(save_directory)

def _save_openvino_config(self, save_directory: Union[str, Path]):
if self._openvino_config is not None:
if not isinstance(self._openvino_config.quantization_config.dataset, (str, type(None))):
self._openvino_config.quantization_config.dataset = None

self._openvino_config.save_pretrained(save_directory)

@classmethod
def _from_pretrained(
cls,
@@ -203,12 +217,28 @@ def _from_pretrained(
local_files_only=local_files_only,
)

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

model = cls.load_model(model_cache_path, quantization_config=quantization_config)
return cls(model, config=config, model_save_dir=model_cache_path.parent, **kwargs)
return cls(
model,
config=config,
model_save_dir=model_cache_path.parent,
quantization_config=quantization_config,
**kwargs,
)

@staticmethod
def _prepare_weight_quantization_config(
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, load_in_8bit: bool = False
):
# Give default quantization config if not provided and load_in_8bit=True
if not quantization_config and load_in_8bit:
quantization_config = OVWeightQuantizationConfig(bits=8)
elif isinstance(quantization_config, dict):
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)

return quantization_config

@staticmethod
def _cached_file(
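An end-to-end sketch of the base-model changes above, assuming the task-specific OVModel classes exposed by optimum-intel (model id and output path are placeholders): load_in_8bit is normalized to a default 8-bit config by _prepare_weight_quantization_config, kept on the instance as _openvino_config, and written out by _save_openvino_config on save.

```python
from optimum.intel import OVModelForSequenceClassification

# load_in_8bit=True with no explicit config becomes OVWeightQuantizationConfig(bits=8).
model = OVModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", export=True, load_in_8bit=True
)
print(model._openvino_config.quantization_config.bits)  # 8, from the default config

# save_pretrained now also writes the quantization summary alongside the IR files.
model.save_pretrained("ov_int8_classifier")
```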
11 changes: 8 additions & 3 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -58,6 +58,7 @@ def __init__(
dynamic_shapes: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
**kwargs,
):
self.config = config
@@ -76,6 +77,9 @@
self.decoder_model = decoder
self.decoder_with_past_model = decoder_with_past
self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
@@ -96,6 +100,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
dst_path = os.path.join(save_directory, dst_file_name)
openvino.save_model(src_file, dst_path, compress_to_fp16=False)

self._save_openvino_config(save_directory)

@classmethod
def _from_pretrained(
cls,
@@ -155,9 +161,7 @@ def _from_pretrained(
decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
decoder_with_past = None

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

# Load model from a local directory
if os.path.isdir(model_id):
@@ -205,6 +209,7 @@
decoder_with_past=decoder_with_past,
config=config,
model_save_dir=model_save_dir,
quantization_config=quantization_config,
**kwargs,
)

24 changes: 15 additions & 9 deletions optimum/intel/openvino/modeling_decoder.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
import os
from pathlib import Path
@@ -100,6 +101,7 @@ def __init__(
dynamic_shapes: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
if not dynamic_shapes:
@@ -117,6 +119,7 @@
dynamic_shapes=False,
ov_config=ov_config,
model_save_dir=model_save_dir,
quantization_config=quantization_config,
**kwargs,
)

@@ -224,6 +227,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
dst_path = os.path.join(save_directory, OV_XML_FILE_NAME)
openvino.save_model(model_to_save, dst_path, compress_to_fp16=False)

self._save_openvino_config(save_directory)

@classmethod
def _from_transformers(
cls,
@@ -576,15 +581,10 @@ def _from_pretrained(
local_files_only=local_files_only,
)

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}

if isinstance(quantization_config, dict):
if quantization_config == {"bits": 4} and config.name_or_path in _DEFAULT_4BIT_CONFIGS:
quantization_config = _DEFAULT_4BIT_CONFIGS[config.name_or_path]
if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}:
quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config)

quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

load_in_4bit = quantization_config.bits == 4 if quantization_config else False
model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config)
@@ -603,7 +603,12 @@

enable_compilation = kwargs.pop("compile", True) and not load_in_4bit
causal_model = init_cls(
model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs
model=model,
config=config,
model_save_dir=model_cache_path.parent,
compile=enable_compilation,
quantization_config=quantization_config,
**kwargs,
)

if load_in_4bit:
@@ -632,6 +637,7 @@ def _from_pretrained(
# seqlen = get_seqlen(causal_model)
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

_weight_only_quantization(model, quantization_config)
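A hedged usage sketch for the decoder path above; the model id is a placeholder and the wikitext2 dataset name is an assumption about what get_dataset accepts:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Data-aware 4-bit weight-only quantization: the string dataset name is resolved to
# calibration samples at load time. A bare {"bits": 4} dict passed as quantization_config
# may instead be swapped for a model-specific preset from _DEFAULT_4BIT_CONFIGS.
quantization_config = OVWeightQuantizationConfig(bits=4, dataset="wikitext2")
model = OVModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0", export=True, quantization_config=quantization_config
)

# The string dataset name survives in the saved config; a prepared nncf.Dataset would
# not, which is why the config is deep-copied before being handed to calibration.
model.save_pretrained("ov_llama_4bit")
```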
21 changes: 16 additions & 5 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -87,6 +87,7 @@ def __init__(
compile: bool = True,
ov_config: Optional[Dict[str, str]] = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
self._internal_dict = config
@@ -140,6 +141,10 @@

self._internal_dict.pop("vae", None)

self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
Saves the model to the OpenVINO IR format so that it can be re-loaded using the
@@ -177,6 +182,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
if self.tokenizer_2 is not None:
self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2")

self._save_openvino_config(save_directory)

@classmethod
def _from_pretrained(
cls,
@@ -257,10 +264,7 @@ def _from_pretrained(
else:
kwargs[name] = load_method(new_model_save_dir)

# Give default quantization config if not provided and load_in_8bit=True
if load_in_8bit:
quantization_config = quantization_config or {"bits": 8}

quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
unet = cls.load_model(
new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, quantization_config
)
@@ -278,7 +282,14 @@
if model_save_dir is None:
model_save_dir = new_model_save_dir

return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
return cls(
unet=unet,
config=config,
model_save_dir=model_save_dir,
quantization_config=quantization_config,
**components,
**kwargs,
)

@classmethod
def _from_transformers(
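The diffusion pipelines get the same treatment; a sketch assuming the OVStableDiffusionPipeline API (model id and output path are placeholders):

```python
from optimum.intel import OVStableDiffusionPipeline

# load_in_8bit=True is turned into a default 8-bit weight-only config by
# _prepare_weight_quantization_config and applied when loading the pipeline components.
pipe = OVStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", export=True, load_in_8bit=True
)

# Saving the pipeline now also records the quantization summary.
pipe.save_pretrained("ov_sd_int8")
```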
11 changes: 7 additions & 4 deletions optimum/intel/openvino/quantization.py
@@ -45,7 +45,7 @@
from ..utils.constant import _TASK_ALIASES
from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available
from ..utils.modeling_utils import get_model_device
from .configuration import OVConfig, OVWeightQuantizationConfig
from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig, OVWeightQuantizationConfig
from .modeling_base import OVBaseModel
from .utils import (
MAX_ONNX_OPSET,
@@ -235,8 +235,11 @@ def quantize(
)
ov_config = ov_config or quantization_config

if ov_config is not None and not isinstance(ov_config, OVConfig):
raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.")
if ov_config is not None:
if not isinstance(ov_config, OVConfig):
raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.")
elif ov_config.compression is None:
ov_config.compression = DEFAULT_QUANTIZATION_CONFIG

if isinstance(self.model, OVBaseModel):
self._quantize_ovbasemodel(
@@ -355,7 +358,7 @@ def _quantize_torchmodel(
logger.info(
"No configuration describing the quantization process was provided, a default OVConfig will be generated."
)
ov_config = OVConfig()
ov_config = OVConfig(compression=DEFAULT_QUANTIZATION_CONFIG)
onnx_file_name = (
ONNX_WEIGHTS_NAME
if file_name is None and ov_config.save_onnx_model
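A usage sketch of the new defaulting in OVQuantizer.quantize, assuming model, calibration_dataset, and save_dir are defined as in the docs example earlier in this PR:

```python
from optimum.intel import OVConfig, OVQuantizer

quantizer = OVQuantizer.from_pretrained(model)

# OVConfig() no longer pre-fills `compression`; quantize() now falls back to
# DEFAULT_QUANTIZATION_CONFIG when that field is left as None.
quantizer.quantize(
    ov_config=OVConfig(),
    calibration_dataset=calibration_dataset,
    save_directory=save_dir,
)

# Omitting the config entirely behaves the same way: a default OVConfig is
# generated internally (see _quantize_torchmodel above).
# quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
```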
66 changes: 35 additions & 31 deletions optimum/intel/openvino/trainer.py
@@ -89,7 +89,7 @@

from ..utils.constant import _TASK_ALIASES
from ..utils.import_utils import is_transformers_version
from .configuration import OVConfig
from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig
from .quantization import OVDataLoader
from .training_args import OVTrainingArguments
from .utils import (
@@ -225,37 +225,41 @@ def __init__(
self.teacher.eval()
self.compression_controller = None

if self.ov_config is not None and self.args.do_train:
self._set_task()
train_dataloader = self.get_train_dataloader()
model_inputs = next(iter(train_dataloader))
for label_name in self.label_names:
model_inputs.pop(label_name)
force_batch_one = self._is_pruning_enabled()
self.ov_config.add_input_info(model_inputs, force_batch_one)
nncf_config = NNCFConfig.from_dict(self.ov_config.__dict__)
nncf_config.register_extra_structs(
[
QuantizationRangeInitArgs(OVDataLoader(train_dataloader)),
BNAdaptationInitArgs(OVDataLoader(train_dataloader)),
]
)
if self.ov_config is not None:
if self.ov_config.compression is None:
self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG

if self.args.do_train:
self._set_task()
train_dataloader = self.get_train_dataloader()
model_inputs = next(iter(train_dataloader))
for label_name in self.label_names:
model_inputs.pop(label_name)
force_batch_one = self._is_pruning_enabled()
self.ov_config.add_input_info(model_inputs, force_batch_one)
nncf_config = NNCFConfig.from_dict(self.ov_config.__dict__)
nncf_config.register_extra_structs(
[
QuantizationRangeInitArgs(OVDataLoader(train_dataloader)),
BNAdaptationInitArgs(OVDataLoader(train_dataloader)),
]
)

# Configure NNCF logging
# Disable nncf logging to stdout except error
# but to file nncf_output.log
nncf_config["log_dir"] = args.output_dir
nncf_log_file_handler = logging.logging.FileHandler(os.path.join(args.output_dir, NNCF_LOG_FILE_NAME))
nncf_log_file_handler.setFormatter(logging.logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
nncf_logger.addHandler(nncf_log_file_handler)
set_log_level(logging.ERROR)
nncf_logger.setLevel(logging.INFO)
nncf_log_file_handler.setLevel(logging.INFO)

self.compression_controller, self.model = create_compressed_model(self.model, nncf_config)
self.model_wrapped = self.model
# TODO : To deprecate once support transformers > 4.30.0
self.deepspeed = None
# Configure NNCF logging
# Disable nncf logging to stdout except error
# but to file nncf_output.log
nncf_config["log_dir"] = args.output_dir
nncf_log_file_handler = logging.logging.FileHandler(os.path.join(args.output_dir, NNCF_LOG_FILE_NAME))
nncf_log_file_handler.setFormatter(logging.logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
nncf_logger.addHandler(nncf_log_file_handler)
set_log_level(logging.ERROR)
nncf_logger.setLevel(logging.INFO)
nncf_log_file_handler.setLevel(logging.INFO)

self.compression_controller, self.model = create_compressed_model(self.model, nncf_config)
self.model_wrapped = self.model
# TODO : To deprecate once support transformers > 4.30.0
self.deepspeed = None

def _set_signature_columns_if_needed(self):
if self._signature_columns is None: