if is_intel_extension_for_transformers_available():
    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

+    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
+
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
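For reference, the guarded import above means the ITREX-specific classes are only importable when intel-extension-for-transformers is installed. A minimal sketch of that guard pattern follows; the helper's module path is an assumption, since this hunk does not show where it is imported from:

```python
# Sketch only: the helper's exact location is an assumption; the diff only
# shows that is_intel_extension_for_transformers_available() is in scope.
from optimum.intel.utils.import_utils import is_intel_extension_for_transformers_available

if not is_intel_extension_for_transformers_available():
    raise ImportError(
        "Weight-only quantization requires intel-extension-for-transformers: "
        "pip install intel-extension-for-transformers"
    )
```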
@@ -147,7 +149,9 @@ class OptimizationArguments:
    )
    quantization_approach: str = field(
        default="dynamic",
-        metadata={"help": "Quantization approach. Supported approach are static, dynamic and aware_training."},
+        metadata={
+            "help": "Quantization approach. Supported approaches are static, dynamic, aware_training and weight_only."
+        },
    )
    smooth_quant: bool = field(
        default=False,
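These OptimizationArguments fields are parsed by transformers' HfArgumentParser, so each field becomes a CLI flag and the new weight_only approach can be selected from the command line. A minimal, self-contained sketch with just the field touched here (the full dataclass lives in the example script):

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class OptimizationArguments:
    quantization_approach: str = field(
        default="dynamic",
        metadata={
            "help": "Quantization approach. Supported approaches are static, dynamic, aware_training and weight_only."
        },
    )


# HfArgumentParser turns dataclass fields into argparse flags.
parser = HfArgumentParser(OptimizationArguments)
(optim_args,) = parser.parse_args_into_dataclasses(args=["--quantization_approach", "weight_only"])
assert optim_args.quantization_approach == "weight_only"
```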
@@ -200,8 +204,12 @@ class OptimizationArguments:
        default=False,
        metadata={"help": "Whether or not to verify the loading of the quantized model."},
    )
+    bits: str = field(
+        default="4",
+        metadata={"help": "Number of bits of weight for weight-only quantization (1-8 bits)."},
+    )
    weight_dtype: str = field(
-        default="int8",
+        default="int4_clip",
        metadata={"help": "weight dtype for weight only quantization."},
    )
    group_size: int = field(
@@ -218,9 +226,24 @@ class OptimizationArguments:
    )
    quantization_methodology: str = field(
        default="RTN",
-        metadata={
-            "help": "Quantization methodology for weight only quantization. Choose from 'RTN', 'AWQ' and 'GPTQ'."
-        },
+        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
+    )
+    gptq_percdamp: float = field(
+        default=0.01,
+        metadata={"help": "Percent of the average Hessian diagonal to use for dampening."},
+    )
+    gptq_block_size: int = field(
+        default=128,
+        metadata={"help": "Block size: the sub weight matrix size GPTQ runs on."},
+    )
+    gptq_nsamples: int = field(default=128, metadata={"help": "Number of calibration data samples."})
+    gptq_use_max_length: bool = field(
+        default=False,
+        metadata={"help": "Whether to set all calibration sequence lengths to args.gptq_pad_max_length."},
+    )
+    gptq_pad_max_length: int = field(
+        default=2048,
+        metadata={"help": "Maximum sequence length of the calibration dataset; this should align with your model config."},
    )
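The gptq_percdamp default follows the standard GPTQ recipe, where the Hessian diagonal is dampened by a fraction of its own mean before inversion. A sketch of that arithmetic (NumPy and the literal values are illustrative; INC's internal GPTQ implementation is not shown in this diff):

```python
import numpy as np

percdamp = 0.01  # optim_args.gptq_percdamp
H_diag = np.array([4.0, 1.0, 0.5, 2.5])  # illustrative Hessian diagonal
damp = percdamp * H_diag.mean()  # "percent of the average Hessian diagonal"
H_diag_damped = H_diag + damp  # keeps the later inversion numerically stable
print(H_diag_damped)  # [4.02 1.02 0.52 2.52]
```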
@@ -636,11 +659,21 @@ def compute_metrics(eval_preds):
            )
        if optim_args.apply_pruning or optim_args.apply_distillation:
            raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
+        if optim_args.quantization_methodology == "GPTQ":
+            algorithm_args = {
+                "act_order": False,
+                "percdamp": optim_args.gptq_percdamp,
+                "block_size": optim_args.gptq_block_size,
+                "nsamples": optim_args.gptq_nsamples,
+                "use_max_length": optim_args.gptq_use_max_length,
+                "pad_max_length": optim_args.gptq_pad_max_length,
+            }
        quantization_config = WeightOnlyQuantConfig(
            weight_dtype=optim_args.weight_dtype,
            group_size=optim_args.group_size,
            scheme=optim_args.weight_only_scheme,
            algorithm=optim_args.quantization_methodology,
+            algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
        )
    else:
        quantization_config = PostTrainingQuantConfig(
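Concretely, a weight_only run with the GPTQ methodology ends up building a config equivalent to the sketch below. The keyword names come from the hunk above; the literal values and the "sym" scheme are illustrative assumptions, since the defaults for group_size and weight_only_scheme are not shown in this diff:

```python
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

quantization_config = WeightOnlyQuantConfig(
    weight_dtype="int4_clip",  # optim_args.weight_dtype default from this diff
    group_size=128,            # assumption: default not shown here
    scheme="sym",              # assumption: weight_only_scheme default not shown here
    algorithm="GPTQ",
    algorithm_args={  # mirrors the mapping built from the gptq_* CLI fields
        "act_order": False,
        "percdamp": 0.01,
        "block_size": 128,
        "nsamples": 128,
        "use_max_length": False,
        "pad_max_length": 2048,
    },
)
```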
@@ -733,17 +766,20 @@ def compute_metrics(eval_preds):
        quantizer.quantize(
            quantization_config=quantization_config,
            save_directory=training_args.output_dir,
-            calibration_dataset=train_dataset
-            if optim_args.quantization_approach in ["static", "weight_only"]
-            else None,
-            batch_size=1
-            if optim_args.quantization_approach == "weight_only"
-            else training_args.per_device_train_batch_size,
+            calibration_dataset=(
+                train_dataset if optim_args.quantization_approach in ["static", "weight_only"] else None
+            ),
+            batch_size=(
+                1 if optim_args.quantization_approach == "weight_only" else training_args.per_device_train_batch_size
+            ),
        )
        trainer.model = quantizer._quantized_model

    if optim_args.apply_quantization and optim_args.verify_loading:
-        loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
+        if optim_args.quantization_approach == "weight_only":
+            loaded_model = ITREXAutoModelForCausalLM.from_pretrained(training_args.output_dir)
+        else:
+            loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
        tokens = tokenizer("This is a sample input", return_tensors="pt")
        with torch.no_grad():
            original_model_outputs = trainer.model(**tokens)
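The reload check now branches on the quantization approach: weight_only artifacts go through ITREXAutoModelForCausalLM, everything else through INCModelForCausalLM. A condensed sketch of that round trip (the path is an illustrative stand-in for training_args.output_dir, and the tokenizer load assumes it was saved alongside the model):

```python
import torch
from transformers import AutoTokenizer

from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM

output_dir = "path/to/output_dir"  # stand-in for training_args.output_dir
loaded_model = ITREXAutoModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)  # assumes the tokenizer was saved here

tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    loaded_model_outputs = loaded_model(**tokens)
```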