|
57 | 57 | from transformers.utils.versions import require_version
|
58 | 58 |
|
59 | 59 | from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer, INCTrainer
|
60 |
| -from optimum.intel.utils.import_utils import ( |
61 |
| - INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR, |
62 |
| - is_intel_extension_for_transformers_available, |
63 |
| -) |
64 |
| - |
| 60 | +from optimum.intel.utils.import_utils import ITREX_IMPORT_ERROR, is_itrex_available |
65 | 61 |
|
66 |
| -if is_intel_extension_for_transformers_available(): |
67 |
| - from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig |
68 | 62 |
|
| 63 | +if is_itrex_available(): |
| 64 | + from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig |
69 | 65 |
|
70 | 66 | os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
71 | 67 |
|
@@ -227,8 +223,9 @@ class OptimizationArguments:
|
227 | 223 | metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
|
228 | 224 | )
|
# Algorithm used for weight-only quantization: "rtn" or "gptq" (lowercase,
# matching the `== "gptq"` comparison made where the quantization config is built).
#
# `dataclasses.field()` has no `choices` parameter, so passing one raises
# TypeError as soon as the enclosing dataclass is defined. The allowed values
# are carried in `metadata` instead.
# NOTE(review): assumes this dataclass is parsed by transformers'
# HfArgumentParser, which forwards metadata entries (including `choices`) to
# `ArgumentParser.add_argument` -- confirm against the parser used in main().
quantization_methodology: str = field(
    default="rtn",
    metadata={
        "help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'.",
        "choices": ["rtn", "gptq"],
    },
)
|
233 | 230 | damp_percent: float = field(
|
234 | 231 | default=0.01,
|
@@ -658,26 +655,27 @@ def compute_metrics(eval_preds):
|
658 | 655 | else:
|
659 | 656 | recipes = {}
|
660 | 657 | if optim_args.quantization_approach == "weight_only":
|
661 |
| - if not is_intel_extension_for_transformers_available(): |
662 |
| - raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization")) |
| 658 | + if not is_itrex_available(): |
| 659 | + raise ImportError(ITREX_IMPORT_ERROR.format("WeightOnly quantization")) |
663 | 660 | if optim_args.apply_pruning or optim_args.apply_distillation:
|
664 | 661 | raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
|
665 |
| - if optim_args.quantization_methodology == "GPTQ": |
666 |
| - algorithm_args = { |
667 |
| - "act_order": False, |
668 |
| - "percdamp": optim_args.damp_percent, |
669 |
| - "block_size": optim_args.gptq_block_size, |
670 |
| - "nsamples": optim_args.num_calibration_samples, |
671 |
| - "use_max_length": optim_args.use_max_length, |
672 |
| - "pad_max_length": optim_args.pad_max_length, |
673 |
| - } |
674 |
| - quantization_config = WeightOnlyQuantConfig( |
675 |
| - weight_dtype=optim_args.weight_dtype, |
676 |
| - group_size=optim_args.group_size, |
677 |
| - scheme=optim_args.weight_only_scheme, |
678 |
| - algorithm=optim_args.quantization_methodology, |
679 |
| - algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None, |
680 |
| - ) |
| 662 | + |
| 663 | + algorithm_args = { |
| 664 | + "weight_dtype": optim_args.weight_dtype, |
| 665 | + "sym": optim_args.weight_only_scheme == "sym", |
| 666 | + "group_size": optim_args.group_size, |
| 667 | + } |
| 668 | + |
| 669 | + if optim_args.quantization_methodology == "gptq": |
| 670 | + quantization_config = GPTQConfig( |
| 671 | + damp_percent=optim_args.damp_percent, |
| 672 | + nsamples=optim_args.num_calibration_samples, |
| 673 | + blocksize=optim_args.gptq_block_size, |
| 674 | + **algorithm_args, |
| 675 | + ) |
| 676 | + else: |
| 677 | + quantization_config = RtnConfig(**algorithm_args) |
| 678 | + |
681 | 679 | else:
|
682 | 680 | quantization_config = PostTrainingQuantConfig(
|
683 | 681 | approach=optim_args.quantization_approach, recipes=recipes
|
|
0 commit comments