
Commit 6768527

Added awq option. Included NNCF package into openvino extra.

Committed Feb 1, 2024 · 1 parent efe85a2

File tree:
- optimum/intel/openvino/weight_quantization.py
- setup.py

2 files changed: +25 −21 lines

optimum/intel/openvino/weight_quantization.py (+24 −19)
@@ -45,11 +45,13 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
         ratio (`float`, *optional*, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
-        all_layers (`bool`, *optional*, defaults to False):
+        all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
             The sensitivity metric for assigning quantization precision to layers. In order to
             preserve the accuracy of the model, the more sensitive layers receive a higher precision.
+        awq (`bool`, *optional*):
+            Enables the AWQ method to unify weight ranges and improve overall model accuracy.
         ignored_scope (`nncf.IgnoredScope`, *optional*):
             An ignored scope that defines the list of model control flow graph nodes to be ignored during quantization.
@@ -62,9 +64,10 @@ def __init__(
         dataset: Optional[Union[nncf.Dataset, str]] = None,
         ratio: Optional[float] = None,
         group_size: Optional[int] = None,
-        ignored_scope: Optional[nncf.IgnoredScope] = None,
         all_layers: Optional[bool] = None,
         sensitivity_metric: Optional[nncf.SensitivityMetric] = None,
+        awq: Optional[bool] = None,
+        ignored_scope: Optional[nncf.IgnoredScope] = None,
         **kwargs,
     ):
         self.mode = mode
@@ -75,6 +78,7 @@ def __init__(
         self.ignored_scope = ignored_scope
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
+        self.awq = awq
         self.post_init()

     def post_init(self):
@@ -92,25 +96,25 @@ def post_init(self):
                 ['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}"""
             )

+DEFAULT_4BIT_CONFIGS = {
+    "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5},
+    "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
+    "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
+    "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
+    "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
+    "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6},
+    "llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
+    "llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
+    "llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
+    "stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
+    "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
+    "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
+    "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
+    "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
+    "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
+}

 def _check_default_4bit_configs(config: PretrainedConfig):
-    DEFAULT_4BIT_CONFIGS = {
-        "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5},
-        "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
-        "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-        "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
-        "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
-        "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6},
-        "llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-        "llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-        "llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-        "stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-        "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-        "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-        "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-        "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
-        "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-    }
     return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
@@ -146,6 +150,7 @@ def compress_decoder_weights(model, quantization_config: Union[OVWeightQuantizat
         group_size=config.group_size,
         all_layers=config.all_layers,
         sensitivity_metric=config.sensitivity_metric,
+        awq=config.awq,
         ignored_scope=config.ignored_scope,
         dataset=dataset,
     )
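For context, here is a minimal usage sketch (not part of the commit) showing how the new `awq` flag might be exercised once this lands. The import path follows the file touched above, the keyword names mirror the `__init__` parameters visible in the diff, and "wikitext2" is one of the dataset strings accepted by `post_init`; treat the exact values as illustrative assumptions rather than recommended settings.

import nncf

from optimum.intel.openvino.weight_quantization import OVWeightQuantizationConfig

# Hypothetical configuration: 4-bit symmetric weights with AWQ enabled.
config = OVWeightQuantizationConfig(
    mode=nncf.CompressWeightsMode.INT4_SYM,  # 4-bit symmetric weight compression
    dataset="wikitext2",  # AWQ is data-aware, so a calibration dataset is typically needed
    ratio=0.8,            # per the docstring: 80% of layers in INT4, the rest in INT8
    group_size=128,
    awq=True,             # the new option added by this commit
)
# compress_decoder_weights(model, config) then forwards `awq` straight through
# to nncf.compress_weights(), as the last hunk above shows.

As a side note on the last hunk of weight_quantization.py: hoisting DEFAULT_4BIT_CONFIGS to module scope means the dict is built once at import time rather than on every _check_default_4bit_configs call, and it can now be inspected or patched from outside the function.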

setup.py (+1 −2)
@@ -44,8 +44,7 @@
         "onnxruntime<1.15.0",
         "transformers>=4.34.0",
     ],
-    "openvino": ["openvino>=2023.2", "onnx", "onnxruntime", "transformers>=4.36.0", "optimum>=1.16.1"],
-    "nncf": ["nncf>=2.7.0"],
+    "openvino": ["openvino>=2023.2", "onnx", "onnxruntime", "transformers>=4.36.0", "optimum>=1.16.1", "nncf @ git+https://github.com/openvinotoolkit/nncf.git"],
     "ipex": ["intel-extension-for-pytorch", "onnx"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
