|
45 | 45 | )
|
46 | 46 |
|
47 | 47 | from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer
|
| 48 | +from optimum.intel.utils.import_utils import (
| 49 | +    INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR,
| 50 | +    is_intel_extension_for_transformers_available,
| 51 | +)
| 52 | +
| 53 | +
| 54 | +if is_intel_extension_for_transformers_available():
| 55 | +    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig
48 | 56 |
|
49 | 57 |
|
50 | 58 | logging.basicConfig(
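
For context, a minimal sketch of how such an availability check is commonly implemented (the real helper lives in optimum.intel.utils.import_utils; this importlib-based version is an illustrative assumption, not the actual source):

    import importlib.util

    def is_intel_extension_for_transformers_available() -> bool:
        # True when the intel_extension_for_transformers package is importable,
        # so the GPTQConfig/RtnConfig import above runs only if it exists.
        return importlib.util.find_spec("intel_extension_for_transformers") is not None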
|
@@ -281,6 +289,69 @@ def main():
|
281 | 289 | )
|
282 | 290 | parser.add_argument("--dataset_name", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
|
283 | 291 | parser.add_argument("--calib_iters", default=100, type=int, help="calibration iters.")
|
| 292 | +    parser.add_argument(
| 293 | +        "--bits",
| 294 | +        default=4,
| 295 | +        type=int,
| 296 | +        help="Number of bits to use for weight-only quantization (1-8).",
| 297 | +    )
| 298 | +    parser.add_argument(
| 299 | +        "--weight_dtype",
| 300 | +        default="int4_clip",
| 301 | +        type=str,
| 302 | +        help="Weight dtype for weight-only quantization.",
| 303 | +    )
| 304 | +    parser.add_argument(
| 305 | +        "--group_size",
| 306 | +        default=32,
| 307 | +        type=int,
| 308 | +        help="Group size for weight-only quantization. A value in [1, N] splits "
| 309 | +        "the input channels into groups of group_size elements; -1 selects "
| 310 | +        "per-channel quantization, one scale per output channel.",
| 311 | +    )
| 312 | +    parser.add_argument(
| 313 | +        "--weight_only_scheme",
| 314 | +        default="sym",
| 315 | +        choices=["sym", "asym"],
| 316 | +        help="Scheme for weight-only quantization: 'sym' or 'asym'.",
| 317 | +    )
| 318 | +    parser.add_argument(
| 319 | +        "--quantization_methodology",
| 320 | +        choices=["rtn", "gptq"],
| 321 | +        default="rtn",
| 322 | +        type=str,
| 323 | +        help="Quantization methodology for weight-only quantization: 'rtn' or 'gptq'.",
| 324 | +    )
| 325 | +    parser.add_argument(
| 326 | +        "--damp_percent",
| 327 | +        default=0.01,
| 328 | +        type=float,
| 329 | +        help="Fraction of the average Hessian diagonal added to the Hessian diagonal for numerical stability (GPTQ only).",
| 330 | +    )
| 331 | +    parser.add_argument(
| 332 | +        "--gptq_block_size",
| 333 | +        default=128,
| 334 | +        type=int,
| 335 | +        help="Size of the weight sub-matrix blocks GPTQ processes at a time.",
| 336 | +    )
| 337 | +    parser.add_argument(
| 338 | +        "--num_calibration_samples",
| 339 | +        default=128,
| 340 | +        type=int,
| 341 | +        help="Number of examples to use for the GPTQ calibration step.",
| 342 | +    )
| 343 | +    parser.add_argument(
| 344 | +        "--use_max_length",
| 345 | +        action="store_true",
| 346 | +        default=False,
| 347 | +        help="Pad or truncate every calibration sequence to args.pad_max_length.",
| 348 | +    )
| 349 | +    parser.add_argument(
| 350 | +        "--pad_max_length",
| 351 | +        default=2048,
| 352 | +        type=int,
| 353 | +        help="Maximum sequence length for the calibration dataset; this should align with the model config.",
| 354 | +    )
284 | 355 | args = parser.parse_args()
|
285 | 356 |
|
286 | 357 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
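
With the defaults above, the weight-only branch added in the next hunk resolves to roughly the following (values taken directly from the argparse defaults; shown here only to make the flag-to-config mapping concrete):

    # Defaults: --weight_dtype int4_clip, --weight_only_scheme sym, --group_size 32
    algorithm_args = {
        "weight_dtype": "int4_clip",
        "sym": True,  # --weight_only_scheme == "sym"
        "group_size": 32,
    }
    quantization_config = RtnConfig(**algorithm_args)  # --quantization_methodology defaults to "rtn"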
|
@@ -313,6 +384,43 @@ def main():
|
313 | 384 | model.to(args.device)
|
314 | 385 |
|
315 | 386 | if args.apply_quantization:
|
| 387 | +        supported_approaches = {"static", "dynamic", "weight_only"}
| 388 | +        if args.quantization_approach not in supported_approaches:
| 389 | +            raise ValueError(
| 390 | +                f"Unknown quantization approach: {args.quantization_approach}. "
| 391 | +                f"Supported approaches are {supported_approaches}."
| 392 | +            )
| 393 | +        if args.quantization_approach == "weight_only":
| 394 | +            if not is_intel_extension_for_transformers_available():
| 395 | +                raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
| 396 | +
| 397 | +            algorithm_args = {
| 398 | +                "weight_dtype": args.weight_dtype,
| 399 | +                "sym": args.weight_only_scheme == "sym",
| 400 | +                "group_size": args.group_size,
| 401 | +            }
| 402 | +
| 403 | +            if args.quantization_methodology == "gptq":
| 404 | +                quantization_config = GPTQConfig(
| 405 | +                    damp_percent=args.damp_percent,
| 406 | +                    nsamples=args.num_calibration_samples,
| 407 | +                    blocksize=args.gptq_block_size,
| 408 | +                    **algorithm_args,
| 409 | +                )
| 410 | +            else:
| 411 | +                quantization_config = RtnConfig(**algorithm_args)
| 412 | +
| 413 | +        else:
| 414 | +            example_inputs = {"input_ids": torch.randint(100, (1, 32)), "attention_mask": torch.ones(1, 32)}
| 415 | +            quantization_config = PostTrainingQuantConfig(
| 416 | +                approach=args.quantization_approach,
| 417 | +                recipes={
| 418 | +                    "smooth_quant": args.smooth_quant,
| 419 | +                    "smooth_quant_args": {"alpha": args.smooth_quant_alpha, "folding": True},
| 420 | +                },
| 421 | +                example_inputs=example_inputs,
| 422 | +            )
| 423 | +        model.config.return_dict = False
316 | 424 | # This is just an example for calibration_fn. If you want to achieve good accuracy,
|
317 | 425 | # you must perform a calibration on your real dataset.
|
318 | 426 | calib_dataset = load_dataset(args.dataset_name, split="train")
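
Taken together, the weight-only path added above reduces to roughly this standalone sketch. The model name and save directory are placeholders, and it assumes RTN needs no calibration dataset; the quantize() keywords follow the INCQuantizer API already used later in this file:

    from transformers import AutoModelForCausalLM
    from intel_extension_for_transformers.transformers.utils.config import RtnConfig
    from optimum.intel.neural_compressor import INCQuantizer

    model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model
    # Mirrors the argparse defaults wired up above.
    quantization_config = RtnConfig(weight_dtype="int4_clip", sym=True, group_size=32)
    quantizer = INCQuantizer.from_pretrained(model)
    quantizer.quantize(quantization_config=quantization_config, save_directory="quantized_model")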
|
@@ -347,16 +455,6 @@ def calibration_fn(p_model):
|
347 | 455 | do_sample=False,
|
348 | 456 | )
|
349 | 457 |
|
350 | | -        example_inputs = {"input_ids": torch.randint(100, (1, 32)), "attention_mask": torch.ones(1, 32)}
351 | | -        quantization_config = PostTrainingQuantConfig(
352 | | -            approach=args.quantization_approach,
353 | | -            recipes={
354 | | -                "smooth_quant": args.smooth_quant,
355 | | -                "smooth_quant_args": {"alpha": args.smooth_quant_alpha, "folding": True},
356 | | -            },
357 | | -            example_inputs=example_inputs,
358 | | -        )
359 | | -        model.config.return_dict = False
360 | 458 | quantizer = INCQuantizer.from_pretrained(model, calibration_fn=calibration_fn)
|
361 | 459 | with tempfile.TemporaryDirectory() as tmp_dir:
|
362 | 460 | quantizer.quantize(
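
Once quantize() has written its output, the checkpoint can be reloaded through the INCModelForCausalLM class imported at the top of this file. A minimal usage sketch (the path is a placeholder for whatever save_directory was used):

    from optimum.intel.neural_compressor import INCModelForCausalLM

    # Reload the quantized checkpoint produced by quantizer.quantize(...).
    model = INCModelForCausalLM.from_pretrained("quantized_model")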
|
|