17
17
import json
18
18
import math
19
19
import os
20
+ import re
20
21
import types
21
22
22
23
from datasets import load_dataset
33
34
convert ,
34
35
prepare ,
35
36
)
36
- from neural_compressor .torch .utils import is_ipex_available
37
+ from neural_compressor .torch .utils import is_ipex_available , is_package_available
37
38
38
39
if is_ipex_available ():
39
40
import intel_extension_for_pytorch as ipex
40
41
42
+ if is_package_available ("auto_round" ):
43
+ import auto_round
44
+ import transformers
45
+ from auto_round .export .export_to_itrex .model_wrapper import WeightOnlyLinear as auto_round_woq_linear
46
+
41
47
from typing import Union
42
48
43
49
torch = LazyImport ("torch" )
@@ -126,10 +132,12 @@ def _replace_linear(
126
132
if (
127
133
isinstance (module , torch .nn .Linear )
128
134
or isinstance (module , INCWeightOnlyLinear )
129
- or (is_ipex_available ( ) and isinstance (module , ipex . nn . utils . _weight_prepack . _IPEXLinear ))
135
+ or (is_package_available ( "auto_round" ) and isinstance (module , auto_round_woq_linear ))
130
136
) and (name not in modules_to_not_convert ):
131
137
# Check if the current key is not in the `modules_to_not_convert`
132
- if not any (key in "." .join (current_key_name ) for key in modules_to_not_convert ):
138
+ if not any (key in "." .join (current_key_name ) for key in modules_to_not_convert ) and not any (
139
+ re .match (pattern , "." .join (current_key_name )) for pattern in modules_to_not_convert
140
+ ):
133
141
in_features = module .in_features
134
142
out_features = module .out_features
135
143
if device == "cpu" or device == torch .device ("cpu" ) or device == "auto" :
@@ -475,6 +483,54 @@ def convert_to_quantized_model(model, config, device="cpu"):
475
483
run_fn (model , * run_args )
476
484
model = convert (model )
477
485
elif config .quant_method .value == "autoround" :
486
+ if config .is_vlm is True :
487
+ from transformers import AutoProcessor , AutoTokenizer
488
+
489
+ from neural_compressor .torch .algorithms .weight_only .autoround import (
490
+ get_mllm_dataloader as get_autoround_dataloader ,
491
+ )
492
+
493
+ tokenizer = AutoTokenizer .from_pretrained (model .config ._name_or_path )
494
+ processor = AutoProcessor .from_pretrained (model .config ._name_or_path , trust_remote_code = True )
495
+ (
496
+ dataloader ,
497
+ template ,
498
+ config .truncation ,
499
+ config .batch_size ,
500
+ config .gradient_accumulate_steps ,
501
+ config .seq_len ,
502
+ config .n_samples ,
503
+ ) = get_autoround_dataloader (
504
+ template = None ,
505
+ model = model ,
506
+ tokenizer = tokenizer ,
507
+ image_processor = None ,
508
+ dataset = config .dataset ,
509
+ extra_data_dir = None ,
510
+ seqlen = config .seq_len ,
511
+ batch_size = config .batch_size ,
512
+ split = None ,
513
+ apply_template = None ,
514
+ truncation = False ,
515
+ nsamples = config .n_samples ,
516
+ seed = 42 ,
517
+ gradient_accumulate_steps = config .gradient_accumulate_steps ,
518
+ quant_nontext_module = config .quant_nontext_module ,
519
+ processor = processor ,
520
+ )
521
+ else :
522
+ from neural_compressor .torch .algorithms .weight_only .autoround import (
523
+ get_dataloader as get_autoround_dataloader ,
524
+ )
525
+
526
+ dataloader = get_autoround_dataloader (
527
+ tokenizer = config .tokenizer ,
528
+ seqlen = config .seq_len ,
529
+ dataset_name = config .dataset ,
530
+ seed = 42 ,
531
+ bs = config .batch_size ,
532
+ nsamples = config .n_samples ,
533
+ )
478
534
quant_config = AutoRoundConfig (
479
535
dtype = dtype ,
480
536
bits = config .bits ,
@@ -486,24 +542,59 @@ def convert_to_quantized_model(model, config, device="cpu"):
486
542
seqlen = config .seq_len ,
487
543
nsamples = config .n_samples ,
488
544
iters = config .iters ,
545
+ batch_size = config .batch_size ,
489
546
scale_dtype = config .scale_dtype ,
490
547
use_layer_wise = config .use_layer_wise ,
548
+ # vlm arguments
549
+ is_mllm = config .is_vlm ,
550
+ quant_nontext_module = config .quant_nontext_module ,
551
+ truncation = config .truncation ,
552
+ gradient_accumulate_steps = config .gradient_accumulate_steps ,
553
+ export_format = config .export_format ,
491
554
)
555
+
556
+ # vlm set non-text module config
557
+ if config .is_vlm is True :
558
+ from neural_compressor .torch .utils .utility import (
559
+ find_matching_blocks ,
560
+ get_layer_names_in_block ,
561
+ get_multimodal_block_names ,
562
+ )
563
+
564
+ def set_nontext_module_config (model , to_quant_block_names , config ):
565
+ all_block_list = get_multimodal_block_names (model , quant_vision = True )
566
+ all_block_set = set (tuple (block ) for block in all_block_list )
567
+ quant_block_set = set (tuple (block ) for block in to_quant_block_names )
568
+ set_to_full_prec = list (all_block_set - quant_block_set )
569
+ set_to_full_prec = get_layer_names_in_block (model , to_quant_block_names = set_to_full_prec )
570
+ for name in set_to_full_prec :
571
+ config .modules_to_not_convert .append (name )
572
+
573
+ # skip layers not in blocks
574
+ config .modules_to_not_convert .append ("model.vision_embed_tokens.img_projection*" )
575
+ config .modules_to_not_convert .append ("transformer.visual.attn_pool.*_proj" )
576
+ config .modules_to_not_convert .append ("model.mm_projector*" )
577
+ config .modules_to_not_convert .append ("multi_modal_projector" )
578
+ config .modules_to_not_convert .append ("visual.merger" )
579
+
580
+ all_blocks = get_multimodal_block_names (model , quant_config .quant_nontext_module )
581
+ to_quant_block_names = find_matching_blocks (model , all_blocks , quant_config .to_quant_block_names )
582
+ set_nontext_module_config (model , to_quant_block_names , config )
583
+
584
+ for n , m in model .named_modules ():
585
+ if isinstance (m , torch .nn .Linear ) or isinstance (m , transformers .modeling_utils .Conv1D ):
586
+ if m .weight .shape [0 ] % 32 != 0 or m .weight .shape [1 ] % 32 != 0 :
587
+ config .modules_to_not_convert .append (n )
588
+ print (
589
+ f"{ n } will not be quantized due to its shape not being divisible by 32,"
590
+ " resulting in an exporting issue to autogptq"
591
+ )
492
592
if config .modules_to_not_convert != []:
493
593
for module in config .modules_to_not_convert :
494
594
module_name = ".*" + module
495
595
quant_config .set_local (module_name , AutoRoundConfig (dtype = "fp32" ))
496
596
logger .info (f"Do AutoRound algorithm with config { quant_config } " )
497
- from neural_compressor .torch .algorithms .weight_only .autoround import get_dataloader as get_autoround_dataloader
498
597
499
- dataloader = get_autoround_dataloader (
500
- tokenizer = config .tokenizer ,
501
- seqlen = config .seq_len ,
502
- dataset_name = config .dataset ,
503
- seed = 42 ,
504
- bs = config .batch_size ,
505
- nsamples = config .n_samples ,
506
- )
507
598
run_fn = run_fn_for_autoround
508
599
run_args = (dataloader ,)
509
600
model = prepare (model = model , quant_config = quant_config )
0 commit comments