Add dynamic quantization config #661

Merged: 27 commits, Apr 22, 2024
Changes from 14 commits
2 changes: 1 addition & 1 deletion .github/workflows/test_openvino.yml
@@ -35,7 +35,7 @@ jobs:
pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
- name: Test with Pytest
run: |
pytest tests/openvino/ --ignore test_modeling_basic --durations=0
pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
- name: Test openvino-nightly
run: |
pip uninstall -y openvino
80 changes: 37 additions & 43 deletions optimum/intel/openvino/configuration.py
@@ -62,7 +62,6 @@ def __init__(
self,
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = None,
weight_only: Optional[bool] = None,
**kwargs,
):
"""
@@ -72,14 +71,11 @@
entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
weight_only (`bool`, *optional*):
Used to explicitly specify the type of quantization (weight-only or full) to apply.
"""
if isinstance(ignored_scope, nncf.IgnoredScope):
ignored_scope = ignored_scope.__dict__
self.ignored_scope = ignored_scope
self.num_samples = num_samples
self.weight_only = weight_only

def post_init(self):
try:
@@ -191,6 +187,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
Args:
bits (`int`, defaults to 8):
The number of bits to quantize to.
group_size (`int`, *optional*):
The group size to use for quantization. The recommended value is 128; -1 applies per-column quantization.
sym (`bool`, defaults to `False`):
Whether to use symmetric quantization.
tokenizer (`str`, *optional*):
@@ -209,8 +207,6 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
ratio (`float`, defaults to 1.0):
The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
and the rest to INT8_ASYM).
group_size (`int`, *optional*):
The group size to use for quantization. The recommended value is 128; -1 applies per-column quantization.
all_layers (`bool`, *optional*):
Whether to compress all layers, including embedding and final layers, to 4-bit instead of keeping them in 8-bit precision.
sensitivity_metric (`str`, *optional*):
@@ -223,33 +219,24 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
The maximum number of samples composing the calibration dataset.
quant_method (`str`, defaults to OVQuantizationMethod.DEFAULT):
Weight compression method to apply.
weight_only (`bool`, *optional*):
Used to explicitly specify the type of quantization (weight-only or full) to apply. Useful when building
the config from a dictionary.
"""

def __init__(
self,
bits: int = 8,
group_size: Optional[int] = None,
sym: bool = False,
tokenizer: Optional[str] = None,
dataset: Optional[Union[str, List[str]]] = None,
ratio: float = 1.0,
group_size: Optional[int] = None,
all_layers: Optional[bool] = None,
sensitivity_metric: Optional[str] = None,
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = None,
quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT,
weight_only: Optional[bool] = True,
**kwargs,
):
if weight_only is False:
logger.warning(
"Trying to create an instance of `OVWeightQuantizationConfig` with `weight_only` being "
"False. Please check your configuration."
)
super().__init__(ignored_scope, num_samples, True)
super().__init__(ignored_scope, num_samples)
self.bits = bits
self.sym = sym
self.tokenizer = tokenizer
@@ -305,83 +292,90 @@ def post_init(self):
raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")


@dataclass
class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
def __init__(
self,
bits: int = 8,
weights_group_size: Optional[int] = None,
activations_group_size: int = 32,
**kwargs,
):
super().__init__(bits=bits, group_size=weights_group_size, **kwargs)
# TODO add kv_cache_dtype
self.activations_group_size = activations_group_size


@dataclass
class OVQuantizationConfig(OVQuantizationConfigBase):
def __init__(
self,
bits: int = 8,
sym: bool = False,
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = 300,
preset: nncf.QuantizationPreset = None,
model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER,
fast_bias_correction: bool = True,
overflow_fix: OverflowFix = OverflowFix.DISABLE,
weight_only: Optional[bool] = False,
**kwargs,
):
"""
Configuration class containing parameters related to model quantization with NNCF. Compared to weight
compression, during quantization both weights and activations are converted to lower precision.
For weight-only model quantization please see OVWeightQuantizationConfig.
Args:
bits (`int`, defaults to 8):
The number of bits to quantize to.
ignored_scope (`dict`, *optional*):
An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary
entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
preset (`nncf.QuantizationPreset`, *optional*):
A preset controls the quantization mode (symmetric and asymmetric).
It can take the following values:
- `performance`: Symmetric quantization of weights and activations.
- `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
Default value is None; in this case, the `mixed` preset is used for the `transformer`
model type, and `performance` otherwise.
sym (`bool`, defaults to `False`):
Whether to use symmetric quantization on the activations. Symmetric quantization will be applied on the weights in any case.
model_type (`nncf.ModelType`, defaults to nncf.ModelType.TRANSFORMER):
Model type is needed to specify additional patterns in the model. Currently only `transformer` is supported.
fast_bias_correction (`bool`, defaults to True):
Whether to apply fast or full bias correction algorithm.
overflow_fix (`nncf.OverflowFix`, defaults to OverflowFix.DISABLE):
Parameter for controlling overflow fix setting.
weight_only (`bool`, *optional*):
Used to explicitly specify the type of quantization (weight-only or full) to apply. Useful when building
the config from a dictionary.
"""
if weight_only is True:
logger.warning(
"Trying to create an instance of `OVQuantizationConfig` with `weight_only` being True. "
"Please check your configuration."
)
super().__init__(ignored_scope, num_samples, False)
super().__init__(ignored_scope, num_samples)
# TODO: remove checks below once NNCF is updated to 2.10
if isinstance(overflow_fix, str):
overflow_fix = OverflowFix(overflow_fix)
if isinstance(preset, str):
preset = nncf.QuantizationPreset(preset)

self.preset = preset
self.bits = bits
self.sym = sym
self.model_type = model_type
self.fast_bias_correction = fast_bias_correction
self.overflow_fix = overflow_fix
self.post_init()

def to_dict(self) -> Dict[str, Any]:
# TODO: remove code below once NNCF is updated to 2.10
if isinstance(self.overflow_fix, Enum) or isinstance(self.preset, Enum):
if isinstance(self.overflow_fix, Enum):
overflow_fix_value = (
None
if self.overflow_fix is None
else self.overflow_fix
if isinstance(self.overflow_fix, str)
else self.overflow_fix.value
)
preset_value = (
None if self.preset is None else self.preset if isinstance(self.preset, str) else self.preset.value
)
self_copy = copy.deepcopy(self)
self_copy.overflow_fix = overflow_fix_value
self_copy.preset = preset_value
return self_copy.to_dict()
return super().to_dict()

def post_init(self):
r"""
Safety checker that arguments are correct
"""
super().post_init()

if self.bits != 8:
raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")


def _check_default_4bit_configs(config: PretrainedConfig):
return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
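Similarly, a sketch of using the new `OVDynamicQuantizationConfig`; the model ID and group sizes are placeholders, and the top-level `OVModelForCausalLM` import is the usual optimum-intel entry point:

```python
# Sketch: 8-bit weight compression combined with grouped dynamic
# quantization of activations. Values are illustrative, not recommendations.
from optimum.intel import OVModelForCausalLM
from optimum.intel.openvino.configuration import OVDynamicQuantizationConfig

config = OVDynamicQuantizationConfig(
    bits=8,                     # weight precision
    weights_group_size=128,     # forwarded as `group_size` to the parent config
    activations_group_size=32,  # surfaced to the runtime, see modeling_base.py below
)

model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # placeholder model ID
    export=True,
    quantization_config=config,
)
```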
21 changes: 12 additions & 9 deletions optimum/intel/openvino/modeling_base.py
@@ -31,7 +31,7 @@

from ...exporters.openvino import export, main_export
from ..utils.import_utils import is_nncf_available
from .configuration import OVConfig, OVWeightQuantizationConfig
from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig
from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, _print_compiled_model_properties


@@ -64,10 +64,7 @@ def __init__(
self.model_save_dir = model_save_dir
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

self.ov_config = {} if ov_config is None else {**ov_config}
self.preprocessors = kwargs.get("preprocessors", [])
enable_compilation = kwargs.get("compile", True)

@@ -98,12 +95,12 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

@staticmethod
def load_model(
file_name: Union[str, Path],
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
calibration_dataset: Optional = None,
):
"""
Loads the model.
@@ -113,8 +110,6 @@ def load_model(
The path of the model ONNX or XML file.
quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
Quantization config to apply after model is loaded.
calibration_dataset (`nncf.Dataset`, *optional*):
Optional nncf.Dataset to feed to model weight compression when quantization config is provided.
"""

def fix_op_names_duplicates(model: openvino.runtime.Model):
@@ -143,7 +138,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):

from optimum.intel.openvino.quantization import _weight_only_quantization

model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset)
model = _weight_only_quantization(model, quantization_config)

return model

@@ -251,6 +246,14 @@ def _prepare_weight_quantization_config(

return quantization_config

def _set_ov_config_parameters(self):
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

q_config = self._openvino_config.quantization_config if self._openvino_config else None
if isinstance(q_config, OVDynamicQuantizationConfig):
self.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = str(q_config.activations_group_size)
Review comment from AlexKoff88 (Collaborator), Apr 19, 2024:

@echarlaix, shall we turn on 8-bit KV-cache quantization as well? It is essentially per-token INT8 quantization, so it should be safe in terms of accuracy degradation.
@staticmethod
def _cached_file(
model_path: Union[Path, str],
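One detail of the `self.ov_config = {} if ov_config is None else {**ov_config}` refactor above is worth spelling out: because the dictionary is now shallow-copied, the defaults filled in by `_set_ov_config_parameters` no longer leak into the dictionary the caller passed in. A standalone illustration (not code from the PR):

```python
# With the previous `self.ov_config = ov_config`, the default below would
# have been written into the caller's own dictionary.
user_cfg = {"CACHE_DIR": "/tmp/ov_cache"}

ov_config = {} if user_cfg is None else {**user_cfg}  # new behavior: copy
if ov_config.get("PERFORMANCE_HINT") is None:
    ov_config["PERFORMANCE_HINT"] = "LATENCY"

assert "PERFORMANCE_HINT" not in user_cfg  # the caller's dict stays untouched
```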
7 changes: 2 additions & 5 deletions optimum/intel/openvino/modeling_base_seq2seq.py
@@ -66,11 +66,7 @@ def __init__(
self.model_save_dir = model_save_dir
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}

if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"

self.ov_config = {} if ov_config is None else {**ov_config}
self.preprocessors = kwargs.get("preprocessors", [])

if self.is_dynamic:
@@ -84,6 +80,7 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
13 changes: 4 additions & 9 deletions optimum/intel/openvino/modeling_decoder.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
import os
from pathlib import Path
@@ -596,11 +595,10 @@ def _from_pretrained(
quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

load_in_4bit = quantization_config.bits == 4 if quantization_config else False
calibration_dataset = kwargs.get("calibration_dataset", None)

Review comment from the PR author on lines -599 to +598:

Removed the `calibration_dataset` argument. @nikita-savelyevv
model = cls.load_model(
model_cache_path,
quantization_config=None if load_in_4bit else quantization_config,
calibration_dataset=calibration_dataset,
)

model_type = config.model_type.replace("_", "-")
@@ -637,18 +635,15 @@
f"For the given model, we recommend the following `quantization_config` : {default_config}"
)

if calibration_dataset is None and isinstance(quantization_config.dataset, str):
calibration_dataset = None
if isinstance(quantization_config.dataset, str):
tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)

from optimum.gptq.data import get_dataset, prepare_dataset

# from optimum.gptq.utils import get_seqlen

# seqlen = get_seqlen(causal_model)
nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
nsamples = quantization_config.num_samples or 128
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

_weight_only_quantization(model, quantization_config, calibration_dataset)
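The hunk above is the path that now builds the calibration set internally whenever a dataset name is supplied for data-aware weight compression (the external `calibration_dataset` argument having been removed). A sketch of how a user triggers it; the model ID is a placeholder and `wikitext2` is one of the dataset names `optimum.gptq.data.get_dataset` understands:

```python
# Sketch: data-aware 4-bit weight compression. With `dataset` given as a
# string, _from_pretrained tokenizes it via get_dataset/prepare_dataset and
# wraps it in an nncf.Dataset, as in the diff above.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

qconfig = OVWeightQuantizationConfig(
    bits=4,
    dataset="wikitext2",  # resolved by optimum.gptq.data.get_dataset
    num_samples=128,      # matches the fallback used when left unset
)

model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # placeholder model ID
    export=True,
    quantization_config=qconfig,
)
```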
5 changes: 2 additions & 3 deletions optimum/intel/openvino/modeling_diffusion.py
@@ -100,9 +100,7 @@ def __init__(
self._internal_dict = config
self._device = device.upper()
self.is_dynamic = dynamic_shapes
self.ov_config = ov_config if ov_config is not None else {}
if self.ov_config.get("PERFORMANCE_HINT") is None:
self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
self.ov_config = {} if ov_config is None else {**ov_config}

# This attribute is needed to keep one reference on the temporary directory, since garbage collecting
# would end-up removing the directory containing the underlying OpenVINO model
@@ -162,6 +160,7 @@ def __init__(
self._openvino_config = None
if quantization_config:
self._openvino_config = OVConfig(quantization_config=quantization_config)
self._set_ov_config_parameters()

def _save_pretrained(self, save_directory: Union[str, Path]):
"""
24 changes: 8 additions & 16 deletions optimum/intel/openvino/quantization.py
@@ -180,22 +180,15 @@ def __init__(self, model: transformers.PreTrainedModel, task: Optional[str] = No
"""
super().__init__()
self.model = model
feature = kwargs.pop("feature", None)
if feature is not None:
logger.warning("`feature` is deprecated and will be removed in a future version. Use `task` instead.")
if task is not None and task != feature:
logger.warning(
f"Both `feature` and `task` were specified. {task} will be used to define the model topology for the model ONNX export."
)
self.task = task or feature
self.task = task
self.seed = seed
# TODO : deprecate input_names
self.input_names = None
signature = inspect.signature(self.model.forward)
self._signature_columns = list(signature.parameters.keys())
self._export_input_names = [
column for column in self._signature_columns if column not in {"label", "labels", "label_ids"}
]

@property
def input_names(self):
logger.warning("The`input_names` attribute is deprecated and will be removed in v1.18.0")
return None

@classmethod
def from_pretrained(cls, model: PreTrainedModel, **kwargs):
@@ -265,9 +258,8 @@ def quantize(
# TODO: deprecate weights_only argument
if weights_only is not None:
logger.warning(
"`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` "
"as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of "
"OVQuantizationConfig for full model quantization."
"`weights_only` argument is deprecated and will be removed in v1.18.0. In the future please provide `ov_config.quantization_config` "
"as an instance of `OVWeightQuantizationConfig` for weight-only compression or as an instance of `OVQuantizationConfig` for full model quantization."
)

if save_directory is None:
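Finally, a sketch of the migration the updated deprecation warning asks for: passing an explicit config object instead of `weights_only=True`. The model ID and save directory are placeholders:

```python
# Sketch: weight-only compression through an explicit quantization config,
# replacing the deprecated `weights_only` argument.
from transformers import AutoModelForSequenceClassification

from optimum.intel import OVConfig, OVQuantizer, OVWeightQuantizationConfig

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"  # placeholder model
)
quantizer = OVQuantizer.from_pretrained(model)

ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8))
quantizer.quantize(ov_config=ov_config, save_directory="ov_quantized")
```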