 from typing import TYPE_CHECKING, Optional

 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from transformers.utils.quantization_config import QuantizationMethod

 from ...exporters import TasksManager
 from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
+from ...utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
 from ..base import BaseOptimumCLICommand, CommandInfo


@@ -128,6 +130,33 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "compression is applied, they are compressed to INT8."
         ),
     )
+    optional_group.add_argument(
+        "--awq",
+        action="store_true",
+        default=None,
+        help=(
+            "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires "
+            "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset "
+            "argument. Note: there may be no matching patterns in the model to apply AWQ, in which case it will be "
+            "skipped."
+        ),
+    )
+    optional_group.add_argument(
+        "--sensitivity-metric",
+        type=str,
+        default=None,
+        help=(
+            "The sensitivity metric for assigning quantization precision to layers. Can be one of the following: "
+            "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', "
+            "'max_activation_variance', 'mean_activation_magnitude']."
+        ),
+    )
+    optional_group.add_argument(
+        "--num-samples",
+        type=int,
+        default=None,
+        help="The maximum number of samples to take from the dataset for quantization.",
+    )
     optional_group.add_argument(
         "--disable-stateful",
         action="store_true",
@@ -180,7 +209,7 @@ def parse_args(parser: "ArgumentParser"):
         return parse_args_openvino(parser)

     def run(self):
-        from ...exporters.openvino.__main__ import main_export
+        from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers
         from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig

         if self.args.fp16:
@@ -208,6 +237,10 @@ def run(self):
                 and self.args.group_size is None
                 and self.args.sym is None
                 and self.args.all_layers is None
+                and self.args.dataset is None
+                and self.args.num_samples is None
+                and self.args.awq is None
+                and self.args.sensitivity_metric is None
                 and self.args.model in _DEFAULT_4BIT_CONFIGS
             ):
                 quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
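
Note: the extra `is None` checks simply widen the "no user overrides" test. Standalone, the fallback logic amounts to the sketch below (the contents of `_DEFAULT_4BIT_CONFIGS` are illustrative):

    # The built-in per-model default is used only when none of the 4-bit
    # tuning flags were supplied on the command line (values illustrative).
    _DEFAULT_4BIT_CONFIGS = {"some/model-id": {"bits": 4, "sym": False, "group_size": 128}}

    def default_config_applies(args) -> bool:
        overrides = ("group_size", "sym", "all_layers", "dataset", "num_samples", "awq", "sensitivity_metric")
        return all(getattr(args, name) is None for name in overrides) and args.model in _DEFAULT_4BIT_CONFIGS
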
@@ -218,6 +251,10 @@ def run(self):
                     "sym": self.args.sym or False,
                     "group_size": -1 if is_int8 else self.args.group_size,
                     "all_layers": None if is_int8 else self.args.all_layers,
+                    "dataset": self.args.dataset,
+                    "num_samples": self.args.num_samples,
+                    "quant_method": QuantizationMethod.AWQ if self.args.awq else None,
+                    "sensitivity_metric": self.args.sensitivity_metric,
                 }

             if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
@@ -226,7 +263,6 @@ def run(self):
                 )
                 quantization_config["sym"] = "asym" not in self.args.weight_format
                 quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
-                quantization_config["dataset"] = self.args.dataset
             ov_config = OVConfig(quantization_config=quantization_config)

         library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
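
Note: with the redundant `dataset` assignment dropped (it is now set when the config dict is built), the legacy weight-format handling above reduces to a simple mapping; a standalone sketch of the same logic:

    # "int4_sym_g128" -> sym=True, group_size=128; "int4_asym_g64" -> sym=False, group_size=64
    def legacy_int4_options(weight_format: str) -> dict:
        assert weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}
        return {
            "sym": "asym" not in weight_format,
            "group_size": 128 if "128" in weight_format else 64,
        }
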
@@ -240,12 +276,11 @@ def run(self):
         if self.args.convert_tokenizer:
             logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")

-        if (
-            library_name == "diffusers"
-            and ov_config
-            and ov_config.quantization_config
-            and ov_config.quantization_config.dataset is not None
-        ):
+        quantization_config = ov_config.quantization_config if ov_config else None
+        quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None
+        task = infer_task(self.args.task, self.args.model)
+
+        if library_name == "diffusers" and quantize_with_dataset:
             if not is_diffusers_available():
                 raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models"))

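Note: this branch covers data-aware (hybrid) quantization of diffusion pipelines. For context, a minimal sketch of the user-facing equivalent, assuming `OVStableDiffusionPipeline` and `OVWeightQuantizationConfig` from optimum.intel (the model id, dataset, and sample count are illustrative):

    from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

    pipe = OVStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # illustrative model id
        export=True,
        quantization_config=OVWeightQuantizationConfig(
            bits=8, dataset="conceptual_captions", num_samples=224
        ),
    )
    pipe.save_pretrained("ov_sd_hybrid")
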
@@ -270,25 +305,29 @@ def run(self):
             else:
                 raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")

-            model = model_cls.from_pretrained(
-                self.args.model, export=True, quantization_config=ov_config.quantization_config
+            model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
+            model.save_pretrained(self.args.output)
+            if not self.args.disable_convert_tokenizer:
+                maybe_convert_tokenizers(library_name, self.args.output, model)
+        elif task.startswith("text-generation") and quantize_with_dataset:
+            from optimum.intel import OVModelForCausalLM
+
+            # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required
+            model = OVModelForCausalLM.from_pretrained(
+                self.args.model,
+                export=True,
+                quantization_config=quantization_config,
+                stateful=not self.args.disable_stateful,
+                trust_remote_code=self.args.trust_remote_code,
             )
             model.save_pretrained(self.args.output)

-            if self.args.disable_convert_tokenizer:
-                return
-
-            # avoid import when using other exporters (IPEX, INC)
-            from ...exporters.openvino.convert import export_tokenizer
-
-            output = Path(self.args.output)
-            tokenizer = getattr(model, "tokenizer", None)
-            if tokenizer is not None:
-                export_tokenizer(tokenizer, output / "tokenizer")
-
-            tokenizer_2 = getattr(model, "tokenizer_2", None)
-            if tokenizer_2 is not None:
-                export_tokenizer(tokenizer_2, output / "tokenizer_2")
+            maybe_save_preprocessors(self.args.model, self.args.output, trust_remote_code=self.args.trust_remote_code)
+            if not self.args.disable_convert_tokenizer:
+                preprocessors = maybe_load_preprocessors(
+                    self.args.model, trust_remote_code=self.args.trust_remote_code
+                )
+                maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors)
         else:
             # TODO: add input shapes
             main_export(