Add bits and sym parameters to the OV quantization config #560

Merged
13 commits merged on Feb 15, 2024
2 changes: 2 additions & 0 deletions optimum/exporters/openvino/convert.py
@@ -509,6 +509,8 @@ def export_models(
Returns:
list of input_names and output_names from ONNX configuration
"""

# TODO: rename `compression_option` to `quantization_config`
outputs = []

if output_names is not None and len(output_names) != len(models_and_onnx_configs):
3 changes: 1 addition & 2 deletions optimum/intel/openvino/__init__.py
@@ -35,11 +35,10 @@

patch_torch_operators()

from .configuration import OVConfig
from .configuration import OVConfig, OVWeightQuantizationConfig
from .quantization import OVQuantizer
from .trainer import OVTrainer
from .training_args import OVTrainingArguments
from .weight_quantization import OVWeightQuantizationConfig

from .modeling import (
OVModelForAudioClassification,
116 changes: 113 additions & 3 deletions optimum/intel/openvino/configuration.py
@@ -12,15 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Optional, Union
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

import torch
from transformers import PretrainedConfig
from transformers.utils.quantization_config import QuantizationConfigMixin

from optimum.configuration_utils import BaseConfig

from .weight_quantization import OVWeightQuantizationConfig


DEFAULT_QUANTIZATION_CONFIG = {
"algorithm": "quantization",
@@ -77,6 +77,28 @@
}


DEFAULT_4BIT_CONFIGS = {
"databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5},
"EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64},
"facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
"bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6},
"togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128},
"HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6},
"meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
"meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
"meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
"stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
"stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
"stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
"pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
"THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72},
"Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
"openlm-research/open_llama_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
"tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
"psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
}


class OVConfig(BaseConfig):
CONFIG_NAME = "openvino_config.json"
FULL_CONFIGURATION_FILE = "openvino_config.json"
@@ -127,3 +149,91 @@ def _enable_standard_onnx_export_option(self):
for i, algo_config in enumerate(self.compression):
if algo_config["algorithm"] == "quantization":
self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model


@dataclass
class OVWeightQuantizationConfig(QuantizationConfigMixin):
"""
This is a wrapper class holding all the attributes used to configure NNCF weight quantization for a model
loaded through the `optimum-intel` API.

Args:

bits (`int`, defaults to 8):
The number of bits to quantize to.
sym (`bool`, *optional*, defaults to `False`):
Whether to use symmetric quantization.
tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
The tokenizer used to process the dataset. You can pass either:
- A custom tokenizer object.
- A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
user or organization name, like `dbmdz/bert-base-german-cased`.
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
dataset (`str` or `List[str]`, *optional*):
The dataset used for data-aware compression. You can provide your own dataset as a list of strings, or use
one of the predefined datasets: ['wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'].
group_size (`int`, *optional*, defaults to 128):
The group size to use for quantization. The recommended value is 128, and -1 results in per-column quantization.
ratio (`float`, *optional*, defaults to 1.0):
The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
and the rest to INT8_ASYM).
all_layers (`bool`, *optional*):
Whether to compress all layers to 4-bit instead of keeping a subset of them in 8-bit precision.
sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
The sensitivity metric for assigning quantization precision to layers. In order to
preserve the accuracy of the model, the more sensitive layers receive a higher precision.
awq (`bool`, *optional*):
Whether to apply the AWQ algorithm to adjust weight ranges and improve overall model accuracy.
ignored_scope (`nncf.IgnoredScope`, *optional*):
An ignored scope that defines the list of model nodes to be ignored during quantization.

"""

def __init__(
self,
bits: int = 8,
sym: bool = False,
tokenizer: Any = None,
dataset: Optional[str] = None,
ratio: Optional[float] = None,
group_size: Optional[int] = None,
all_layers: Optional[bool] = None,
sensitivity_metric: Optional[str] = None,
ignored_scope: Optional[dict] = None,
**kwargs,
):
self.bits = bits
self.sym = sym
self.tokenizer = tokenizer
self.dataset = dataset
self.group_size = group_size
self.ratio = ratio
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.ignored_scope = ignored_scope
self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release
self.post_init()

def post_init(self):
r"""
Safety checker that the arguments are correct.
"""
if self.ratio is not None and not (0 <= self.ratio <= 1):
raise ValueError("damp_percent must between 0 and 1.")
if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
raise ValueError("group_size must be greater than 0 or equal to -1")
if self.dataset is not None and isinstance(self.dataset, str):
if self.dataset not in ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]:
raise ValueError(
f"""You have entered a string value for dataset. You can only choose between
['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}"""
)

if self.bits not in [4, 8]:
raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")


def _check_default_4bit_configs(config: PretrainedConfig):
return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
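
For reference, a minimal sketch of how the new `bits` and `sym` parameters are used; the values mirror one of the entries in DEFAULT_4BIT_CONFIGS above, and the import path follows the __init__.py change in this PR:

from optimum.intel.openvino import OVWeightQuantizationConfig

# 4-bit symmetric weight-only quantization with group size 128;
# 80% of the layers go to 4-bit, the remaining ones stay in 8-bit.
wq_config = OVWeightQuantizationConfig(
    bits=4,
    sym=True,
    group_size=128,
    ratio=0.8,
)

# post_init() runs inside __init__, so invalid values fail fast, e.g.:
# OVWeightQuantizationConfig(bits=3)    -> ValueError (only 4 or 8 bits are supported)
# OVWeightQuantizationConfig(ratio=1.5) -> ValueError (ratio must be between 0 and 1)
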
5 changes: 3 additions & 2 deletions optimum/intel/openvino/modeling_decoder.py
@@ -33,9 +33,10 @@
from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful
from ...exporters.openvino.stateful import model_has_state
from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
from .configuration import OVWeightQuantizationConfig
from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
from .quantization import _int4_weight_only_quantization
from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE
from .weight_quantization import OVWeightQuantizationConfig, compress_decoder_weights


logger = logging.getLogger(__name__)
@@ -594,7 +595,7 @@ def _from_pretrained(
causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)

if load_in_4bit:
compress_decoder_weights(causal_model, quantization_config)
_int4_weight_only_quantization(causal_model, quantization_config)
return causal_model


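Assuming `load_in_4bit` and `quantization_config` are forwarded from the public `from_pretrained` call to the `_from_pretrained` hook shown above (that plumbing is not visible in this diff), loading a 4-bit compressed decoder would look roughly like this:

from optimum.intel import OVModelForCausalLM
from optimum.intel.openvino import OVWeightQuantizationConfig

# "databricks/dolly-v2-3b" is one of the checkpoints listed in DEFAULT_4BIT_CONFIGS;
# passing an explicit quantization_config overrides those per-model defaults.
model = OVModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    export=True,
    load_in_4bit=True,
    quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, group_size=32, ratio=0.5),
)
model.save_pretrained("dolly-v2-3b-ov-int4")
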
112 changes: 83 additions & 29 deletions optimum/intel/openvino/quantization.py
@@ -24,15 +24,15 @@
import transformers
from accelerate.data_loader import DataLoaderStateMixin
from datasets import Dataset, load_dataset
from nncf import NNCFConfig
from nncf import CompressWeightsMode, IgnoredScope, NNCFConfig, SensitivityMetric
from nncf.torch import create_compressed_model, register_default_init_args, register_module
from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk
from nncf.torch.initialization import PTInitializingDataLoader
from openvino._offline_transformations import compress_quantize_weights_transformation
from openvino.runtime import Core, Tensor
from torch.utils._pytree import tree_map
from torch.utils.data import DataLoader, RandomSampler
from transformers import DataCollator, PreTrainedModel, default_data_collator
from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
from transformers.pytorch_utils import Conv1D

from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
@@ -44,19 +44,18 @@
from ...exporters.openvino.stateful import ensure_export_task_support_stateful, ensure_stateful_is_available
from ..utils.constant import _TASK_ALIASES
from ..utils.modeling_utils import get_model_device
from .configuration import OVConfig
from .configuration import OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs
from .modeling_base import OVBaseModel
from .modeling_decoder import OVBaseDecoderModel
from .utils import (
MAX_ONNX_OPSET,
MIN_ONNX_QDQ_OPSET,
ONNX_WEIGHTS_NAME,
OV_XML_FILE_NAME,
)
from .weight_quantization import OVWeightQuantizationConfig, compress_decoder_weights


COMPRESSION_OPTIONS = {
# TODO : remove as unused
_COMPRESSION_OPTIONS = {
"int8": {"mode": nncf.CompressWeightsMode.INT8},
"int4_sym_g128": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128},
"int4_asym_g128": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
@@ -234,27 +233,29 @@ def quantize(
)
ov_config = ov_config or quantization_config

if isinstance(self.model, OVBaseDecoderModel) and self.model.use_cache:
self._quantize_ovcausallm(
calibration_dataset,
save_directory,
batch_size,
data_collator,
remove_unused_columns,
weights_only,
ov_config,
**kwargs,
)
elif isinstance(self.model, OVBaseModel):
self._quantize_ovbasemodel(
calibration_dataset,
save_directory,
batch_size,
data_collator,
remove_unused_columns,
weights_only,
**kwargs,
)
if isinstance(self.model, OVBaseModel):
if self.model.export_feature == "text-generation" and self.model.use_cache:
self._quantize_ovcausallm(
calibration_dataset,
save_directory,
batch_size,
data_collator,
remove_unused_columns,
weights_only,
ov_config,
**kwargs,
)
else:
self._quantize_ovbasemodel(
calibration_dataset,
save_directory,
batch_size,
data_collator,
remove_unused_columns,
weights_only,
**kwargs,
)

elif isinstance(self.model, torch.nn.Module):
self._quantize_torchmodel(
calibration_dataset,
@@ -272,7 +273,7 @@ def quantize(
def _get_compression_options(self, config: OVConfig):
options = {}
if config is not None and "type" in config.compression:
options = COMPRESSION_OPTIONS[config.compression["type"]]
options = _COMPRESSION_OPTIONS[config.compression["type"]]
if "ratio" in config.compression:
options["ratio"] = config.compression["ratio"]
return options
@@ -334,7 +335,7 @@ def _quantize_ovcausallm(
quantization_config = OVWeightQuantizationConfig(mode=nncf.CompressWeightsMode.INT8_SYM)
self.model.model = nncf.compress_weights(self.model.model)
else:
compress_decoder_weights(self.model, quantization_config)
_int4_weight_only_quantization(self.model, quantization_config)

self.model.save_pretrained(save_directory)
return
@@ -579,3 +580,56 @@ def _get_calibration_dataloader(
def _remove_unused_columns(self, dataset: Dataset):
ignored_columns = list(set(dataset.column_names) - set(self._signature_columns))
return dataset.remove_columns(ignored_columns)


def _int4_weight_only_quantization(
model: OVBaseModel, quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None
):
if model.export_feature != "text-generation":
raise ValueError("Only `OVModelForCausalLM` are supported for now")

quantization_config = quantization_config or _check_default_4bit_configs(model.config)
ov_model = model.model

if quantization_config is not None:
config = quantization_config
if isinstance(config, Dict):
config = OVWeightQuantizationConfig.from_dict(quantization_config)

dataset = config.dataset

if config.dataset is not None and isinstance(config.dataset, str):
tokenizer = config.tokenizer
if tokenizer is None:
tokenizer = AutoTokenizer.from_pretrained(model.config.name_or_path)
elif isinstance(tokenizer, str):
tokenizer = AutoTokenizer.from_pretrained(tokenizer)

from optimum.gptq.data import get_dataset, prepare_dataset

dataset = get_dataset(config.dataset, tokenizer, seqlen=32)
dataset = prepare_dataset(dataset)
dataset = nncf.Dataset(dataset, lambda x: model.prepare_inputs(**x))

sensitivity_metric = None
if isinstance(config.sensitivity_metric, str):
sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper())

ignored_scope = None
if isinstance(config.ignored_scope, dict):
ignored_scope = IgnoredScope(**config.ignored_scope)

model.model = nncf.compress_weights(
ov_model,
mode=CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM,
ratio=config.ratio,
group_size=config.group_size,
all_layers=config.all_layers,
sensitivity_metric=sensitivity_metric,
# awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0
ignored_scope=ignored_scope,
dataset=dataset,
)
else:
# Data-free weight-only quantization to asymmetric INT4
model.model = nncf.compress_weights(ov_model, mode=nncf.CompressWeightsMode.INT4_ASYM)
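
For the data-aware branch, here is a sketch that calls the helper defined above directly (it is private and is normally reached through the `load_in_4bit` path); with a string dataset, the helper tokenizes calibration samples via `optimum.gptq.data` and wraps them into an `nncf.Dataset` itself:

from optimum.intel import OVModelForCausalLM
from optimum.intel.openvino import OVWeightQuantizationConfig
from optimum.intel.openvino.quantization import _int4_weight_only_quantization

model = OVModelForCausalLM.from_pretrained("databricks/dolly-v2-3b", export=True)

# With a string dataset, the helper loads the tokenizer from the model config
# (or from `tokenizer` if provided) and builds the calibration dataset internally.
wq_config = OVWeightQuantizationConfig(
    bits=4,
    sym=False,
    group_size=64,
    ratio=0.8,
    dataset="wikitext2",
    sensitivity_metric="weight_quantization_error",  # resolved to nncf.SensitivityMetric
)
_int4_weight_only_quantization(model, wq_config)
model.save_pretrained("dolly-v2-3b-ov-int4-data-aware")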