@@ -78,25 +78,26 @@
 }


+
 DEFAULT_4BIT_CONFIGS = {
-    "databricks/dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5},
-    "EleutherAI/gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
-    "facebook/opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-    "bigscience/bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
-    "togethercomputer/RedPajama-INCITE-7B-Instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
-    "HuggingFaceH4/zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6},
-    "meta-llama/Llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-    "meta-llama/Llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-    "meta-llama/Llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-    "stabilityai/stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-    "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-    "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-    "pansophic/rocket-3B": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-    "THUDM/chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
-    "Qwen/Qwen-7B-Chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-    "openlm-research/open_llama_3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
-    "tiiuae/falcon-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
-    "psmathur/orca_mini_3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
+    "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5},
+    "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64},
+    "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6},
+    "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128},
+    "HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6},
+    "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72},
+    "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "openlm-research/open_llama_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
 }


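The commit replaces NNCF-specific modes in these entries with library-agnostic `bits`/`sym` keys. A minimal sketch, not part of the commit, of how such an entry could be mapped back onto an NNCF mode, assuming only the correspondence visible in the diff above (INT4_ASYM becomes `sym=False`, INT4_SYM becomes `sym=True`):

    import nncf

    def to_nncf_mode(cfg: dict) -> "nncf.CompressWeightsMode":
        # Hypothetical helper, not in the commit: reverses the bits/sym encoding.
        if cfg.get("bits", 8) == 4:
            return nncf.CompressWeightsMode.INT4_SYM if cfg.get("sym") else nncf.CompressWeightsMode.INT4_ASYM
        # 8-bit case; INT8_ASYM matches the old default named in the docstring below.
        return nncf.CompressWeightsMode.INT8_SYM if cfg.get("sym") else nncf.CompressWeightsMode.INT8_ASYM

    # e.g. {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5} -> INT4_ASYM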
@@ -159,8 +160,11 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
     loaded using `optimum-intel` api for quantization with NNCF.

     Args:
-        mode (`nncf.CompressWeightsMode`, *optional*, defaults to INT8_ASYM):
-            The model defines the weight compressoin method (4-bit, 8-bit, etc.) available in nncf.compress_weights nncf.CompressWeightsMode.
+
+        bits (`int`, defaults to 8):
+            The number of bits to quantize to.
+        sym (`bool`, *optional*, defaults to `False`):
+            Whether to use symmetric quantization.
         tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
             The tokenizer used to process the dataset. You can pass either:
                 - A custom tokenizer object.
@@ -191,26 +195,27 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):

     def __init__(
         self,
-        mode=None,
+        bits: int = 8,
+        sym: bool = False,
         tokenizer: Any = None,
-        dataset: Optional[Union[nncf.Dataset, str]] = None,
+        dataset: Optional[str] = None,
         ratio: Optional[float] = None,
         group_size: Optional[int] = None,
         all_layers: Optional[bool] = None,
-        sensitivity_metric: Optional[nncf.SensitivityMetric] = None,
-        awq: Optional[bool] = None,
-        ignored_scope: Optional[nncf.IgnoredScope] = None,
+        sensitivity_metric: Optional[str] = None,
+        ignored_scope: Optional[dict] = None,
         **kwargs,
     ):
-        self.mode = mode
+        self.bits = bits
+        self.sym = sym
         self.tokenizer = tokenizer
         self.dataset = dataset
         self.group_size = group_size
         self.ratio = ratio
-        self.ignored_scope = ignored_scope
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
-        self.awq = awq
+        self.ignored_scope = ignored_scope
+        self.quant_method = "default"  # TODO: enable AWQ after nncf v2.9.0 release
         self.post_init()

     def post_init(self):
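A short usage sketch of the new keyword surface (illustrative; the `from optimum.intel import OVWeightQuantizationConfig` path is assumed, and the values are taken from the Llama-2-7b-chat entry above):

    from optimum.intel import OVWeightQuantizationConfig  # import path assumed

    # Before this commit: OVWeightQuantizationConfig(mode=nncf.CompressWeightsMode.INT4_SYM, ...)
    # After: plain ints/bools, no nncf import needed at call sites.
    q_config = OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8)

    # The defaults bits=8, sym=False reproduce the previous INT8_ASYM behaviour.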
@@ -229,5 +234,9 @@ def post_init(self):
         )


+        if self.bits not in [4, 8]:
+            raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")
+
+
 def _check_default_4bit_configs(config: PretrainedConfig):
     return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
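Finally, how a caller might consult the per-model defaults (a hedged sketch; `_check_default_4bit_configs` is module-private and the `AutoConfig` lookup is illustrative, though `name_or_path` is a standard `PretrainedConfig` attribute):

    from transformers import AutoConfig

    hf_config = AutoConfig.from_pretrained("databricks/dolly-v2-3b")
    # Returns {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, or None
    # for models without a curated entry in DEFAULT_4BIT_CONFIGS.
    default_4bit = _check_default_4bit_configs(hf_config)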