Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dynamic quantization config #661

Merged
merged 27 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 122 additions & 114 deletions optimum/intel/openvino/configuration.py

Large diffs are not rendered by default.

21 changes: 12 additions & 9 deletions optimum/intel/openvino/modeling_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

from ...exporters.openvino import export, main_export
from ..utils.import_utils import is_nncf_available
from .configuration import OVConfig, OVWeightQuantizationConfig
from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig
from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties


Expand Down Expand Up @@ -64,10 +64,7 @@ def __init__(
self.model_save_dir = model_save_dir
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

self.ov_config = {} if ov_config is None else {**ov_config}
self.preprocessors = kwargs.get("preprocessors", [])
enable_compilation = kwargs.get("compile", True)

Expand Down Expand Up @@ -98,12 +95,12 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

@staticmethod
def load_model(
file_name: Union[str, Path],
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
calibration_dataset: Optional = None,
):
"""
Loads the model.
Expand All @@ -113,8 +110,6 @@ def load_model(
The path of the model ONNX or XML file.
quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
Quantization config to apply after model is loaded.
calibration_dataset (`nncf.Dataset`, *optional*):
Optional nncf.Dataset to feed to model weight compression when quantization config is provided.
"""

def fix_op_names_duplicates(model: openvino.runtime.Model):
Expand Down Expand Up @@ -143,7 +138,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):

from optimum.intel.openvino.quantization import _weight_only_quantization

model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset)
model = _weight_only_quantization(model, quantization_config)

return model

Expand Down Expand Up @@ -251,6 +246,14 @@ def _prepare_weight_quantization_config(

return quantization_config

def _set_ov_config_parameters(self):
    """Derive OpenVINO runtime options from the model's quantization configuration.

    Ensures a latency-oriented performance hint is present unless the caller
    already provided one, and propagates the dynamic-quantization group size
    to the runtime when an `OVDynamicQuantizationConfig` is in use.
    """
    # Only fill in the hint when it is absent (or explicitly None); a
    # caller-supplied value is left untouched.
    current_hint = self.ov_config.get("PERFORMANCE_HINT")
    if current_hint is None:
        self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

    quantization_config = None
    if self._openvino_config:
        quantization_config = self._openvino_config.quantization_config
    if isinstance(quantization_config, OVDynamicQuantizationConfig):
        # The runtime expects the group size as a string-valued property.
        group_size = quantization_config.activations_group_size
        self.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = str(group_size)
Copy link
Collaborator

@AlexKoff88 AlexKoff88 Apr 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@echarlaix, shall we turn on 8-bit KV-cache quantization as well? It is essentially per-token INT8 quantization, and it should be safe in terms of accuracy degradation.


@staticmethod
def _cached_file(
model_path: Union[Path, str],
Expand Down
7 changes: 2 additions & 5 deletions optimum/intel/openvino/modeling_base_seq2seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,7 @@ def __init__(
self.model_save_dir = model_save_dir
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}

if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

self.ov_config = {} if ov_config is None else {**ov_config}
self.preprocessors = kwargs.get("preprocessors", [])

if self.is_dynamic:
Expand All @@ -84,6 +80,7 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
Expand Down
13 changes: 4 additions & 9 deletions optimum/intel/openvino/modeling_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
import os
from pathlib import Path
Expand Down Expand Up @@ -596,11 +595,10 @@ def _from_pretrained(
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

load_in_4bit = quantization_config.bits == 4 if quantization_config else False
calibration_dataset = kwargs.get("calibration_dataset", None)

Comment on lines -599 to +598
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed calibration_dataset argument @nikita-savelyevv

model = cls.load_model(
model_cache_path,
quantization_config=None if load_in_4bit else quantization_config,
calibration_dataset=calibration_dataset,
)

model_type = config.model_type.replace("_", "-")
Expand Down Expand Up @@ -637,18 +635,15 @@ def _from_pretrained(
f"For the given model, we recommend the following `quantization_config` : {default_config}"
)

if calibration_dataset is None and isinstance(quantization_config.dataset, str):
calibration_dataset = None
if isinstance(quantization_config.dataset, str):
tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)

from optimum.gptq.data import get_dataset, prepare_dataset

# from optimum.gptq.utils import get_seqlen

# seqlen = get_seqlen(causal_model)
nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
nsamples = quantization_config.num_samples or 128
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

_weight_only_quantization(model, quantization_config, calibration_dataset)
Expand Down
5 changes: 2 additions & 3 deletions optimum/intel/openvino/modeling_diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ def __init__(
self._internal_dict = config
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
self.ov_config = {} if ov_config is None else {**ov_config}

# This attribute is needed to keep one reference on the temporary directory, since garbage collecting
# would end-up removing the directory containing the underlying OpenVINO model
Expand Down Expand Up @@ -162,6 +160,7 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
Expand Down
24 changes: 8 additions & 16 deletions optimum/intel/openvino/quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,22 +180,15 @@ def __init__(self, model: transformers.PreTrainedModel, task: Optional[str] = No
"""
super().__init__()
self.model = model
feature = kwargs.pop("feature", None)
if feature is not None:
logger.warning("`feature` is deprecated and will be removed in a future version. Use `task` instead.")
if task is not None and task != feature:
logger.warning(
f"Both `feature` and `task` were specified. {task} will be used to define the model topology for the model ONNX export."
)
self.task = task or feature
self.task = task
self.seed = seed
# TODO : deprecate input_names
self.input_names = None
signature = inspect.signature(self.model.forward)
self._signature_columns = list(signature.parameters.keys())
self._export_input_names = [
column for column in self._signature_columns if column not in {"label", "labels", "label_ids"}
]

@property
def input_names(self):
    """Deprecated attribute kept for backward compatibility; always returns None.

    Emits a deprecation warning on access. Scheduled for removal in v1.18.0.
    """
    # Fixed missing space after "The" in the original warning message.
    logger.warning("The `input_names` attribute is deprecated and will be removed in v1.18.0")
    return None

@classmethod
def from_pretrained(cls, model: PreTrainedModel, **kwargs):
Expand Down Expand Up @@ -265,9 +258,8 @@ def quantize(
# TODO: deprecate weights_only argument
if weights_only is not None:
logger.warning(
"`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` "
"as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of "
"OVQuantizationConfig for full model quantization."
"`weights_only` argument is deprecated and will be removed in v1.18.0. In the future please provide `ov_config.quantization_config` "
"as an instance of `OVWeightQuantizationConfig` for weight-only compression or as an instance of `OVQuantizationConfig` for full model quantization."
)

if save_directory is None:
Expand Down
9 changes: 1 addition & 8 deletions optimum/intel/openvino/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,6 @@ def __init__(
preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None,
ov_config: Optional[OVConfig] = None,
task: Optional[str] = None,
feature: Optional[str] = None,
):
self.neftune_noise_alpha = None

Expand All @@ -233,13 +232,7 @@ def __init__(
)

self.ov_config = ov_config
if feature is not None:
logger.warning("`feature` is deprecated and will be removed in a future version. Use `task` instead.")
if task is not None and task != feature:
logger.warning(
f"Both `feature` and `task` were specified. {task} will be used to define the model topology for the model ONNX export."
)
self.task = task or feature
self.task = task
self.teacher = None
if teacher_model is not None:
self.teacher = teacher_model.to(args.device)
Expand Down
Loading
Loading