Commit 050bc9f

Fix config saving
1 parent 6dd2a90 commit 050bc9f

2 files changed, +57 -36 lines changed

optimum/intel/openvino/configuration.py (+37 -28)
@@ -78,25 +78,26 @@
 }


+
 DEFAULT_4BIT_CONFIGS = {
-    "databricks/dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5},
-    "EleutherAI/gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
-    "facebook/opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-    "bigscience/bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
-    "togethercomputer/RedPajama-INCITE-7B-Instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
-    "HuggingFaceH4/zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6},
-    "meta-llama/Llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-    "meta-llama/Llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-    "meta-llama/Llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-    "stabilityai/stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-    "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-    "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-    "pansophic/rocket-3B": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-    "THUDM/chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
-    "Qwen/Qwen-7B-Chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-    "openlm-research/open_llama_3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
-    "tiiuae/falcon-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
-    "psmathur/orca_mini_3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
+    "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5},
+    "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64},
+    "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6},
+    "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128},
+    "HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6},
+    "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72},
+    "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "openlm-research/open_llama_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
 }


@@ -159,8 +160,11 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
     loaded using `optimum-intel` api for quantization with NNCF.

     Args:
-        mode (`nncf.CompressWeightsMode`, *optional*, defaults to INT8_ASYM):
-            The model defines the weight compressoin method (4-bit, 8-bit, etc.) available in nncf.compress_weights nncf.CompressWeightsMode.
+
+        bits (`int`, defaults to 8):
+            The number of bits to quantize to.
+        sym (`bool`, *optional*, defaults to `False`):
+            Whether to use symetric quantization.
         tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
             The tokenizer used to process the dataset. You can pass either:
                 - A custom tokenizer object.
@@ -191,26 +195,27 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):

     def __init__(
         self,
-        mode=None,
+        bits: int = 8,
+        sym: bool = False,
         tokenizer: Any = None,
-        dataset: Optional[Union[nncf.Dataset, str]] = None,
+        dataset: Optional[str] = None,
         ratio: Optional[float] = None,
         group_size: Optional[int] = None,
         all_layers: Optional[bool] = None,
-        sensitivity_metric: Optional[nncf.SensitivityMetric] = None,
-        awq: Optional[bool] = None,
-        ignored_scope: Optional[nncf.IgnoredScope] = None,
+        sensitivity_metric: Optional[str] = None,
+        ignored_scope: Optional[dict] = None,
         **kwargs,
     ):
-        self.mode = mode
+        self.bits = bits
+        self.sym = sym
         self.tokenizer = tokenizer
         self.dataset = dataset
         self.group_size = group_size
         self.ratio = ratio
-        self.ignored_scope = ignored_scope
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
-        self.awq = awq
+        self.ignored_scope = ignored_scope
+        self.quant_method = "default"  # TODO : enable AWQ after nncf v2.9.0 release
         self.post_init()

     def post_init(self):
@@ -229,5 +234,9 @@ def post_init(self):
             )


+        if self.bits not in [4, 8]:
+            raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")
+
+
 def _check_default_4bit_configs(config: PretrainedConfig):
     return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
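
Why this change fixes config saving: every field on `OVWeightQuantizationConfig` is now a JSON-friendly primitive (`int`, `bool`, `str`, `dict`) rather than an `nncf` enum or dataclass instance. A minimal sketch of the round trip this enables, assuming the `to_dict`/`from_dict` helpers inherited from `QuantizationConfigMixin`; the import path is illustrative and may differ between optimum-intel versions (older layouts expose the class from `optimum.intel.openvino.configuration`):

```python
import json

# Assumed import path; adjust to your optimum-intel version if needed.
from optimum.intel import OVWeightQuantizationConfig

# All fields are primitives, so nothing in the resulting dict is an nncf object.
config = OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8)

serialized = config.to_dict()                # plain dict of ints/bools/strings
print(json.dumps(serialized, indent=2))      # JSON-safe, unlike the old enum fields
restored = OVWeightQuantizationConfig.from_dict(serialized)
assert restored.bits == 4 and restored.sym is True
```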

optimum/intel/openvino/quantization.py (+20 -8)
@@ -24,10 +24,11 @@
 import transformers
 from accelerate.data_loader import DataLoaderStateMixin
 from datasets import Dataset, load_dataset
-from nncf import NNCFConfig
+from nncf import NNCFConfig, CompressWeightsMode, SensitivityMetric, IgnoredScope
 from nncf.torch import create_compressed_model, register_default_init_args, register_module
 from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk
 from nncf.torch.initialization import PTInitializingDataLoader
+
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
 from torch.utils._pytree import tree_map
@@ -54,7 +55,9 @@
 )


-COMPRESSION_OPTIONS = {
+
+# TODO : remove as unused
+_COMPRESSION_OPTIONS = {
     "int8": {"mode": nncf.CompressWeightsMode.INT8},
     "int4_sym_g128": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128},
     "int4_asym_g128": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
@@ -272,12 +275,11 @@ def quantize(
     def _get_compression_options(self, config: OVConfig):
         options = {}
         if config is not None and "type" in config.compression:
-            options = COMPRESSION_OPTIONS[config.compression["type"]]
+            options = _COMPRESSION_OPTIONS[config.compression["type"]]
             if "ratio" in config.compression:
                 options["ratio"] = config.compression["ratio"]
         return options

-    # TODO : add ov_config
     def _quantize_ovbasemodel(
         self,
         calibration_dataset: Dataset,
@@ -597,6 +599,7 @@ def _int4_weight_only_quantization(
         config = OVWeightQuantizationConfig.from_dict(quantization_config)

     dataset = config.dataset
+
     if config.dataset is not None and isinstance(config.dataset, str):
         tokenizer = config.tokenizer
         if tokenizer is None:
@@ -610,15 +613,24 @@ def _int4_weight_only_quantization(
         dataset = prepare_dataset(dataset)
         dataset = nncf.Dataset(dataset, lambda x: model.prepare_inputs(**x))

+
+    sensitivity_metric = None
+    if isinstance(config.sensitivity_metric, str):
+        sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper())
+
+    ignored_scope = None
+    if isinstance(config.ignored_scope, dict):
+        ignored_scope = IgnoredScope(**config.ignored_scope)
+
     model.model = nncf.compress_weights(
         ov_model,
-        mode=config.mode,
+        mode=CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM,
         ratio=config.ratio,
         group_size=config.group_size,
         all_layers=config.all_layers,
-        sensitivity_metric=config.sensitivity_metric,
-        awq=config.awq,
-        ignored_scope=config.ignored_scope,
+        sensitivity_metric=sensitivity_metric,
+        # awq=config.quant_method == "awq",  # TODO : remove and add it back once nncf v2.9.0
+        ignored_scope=ignored_scope,
         dataset=dataset,
     )
 else: