@@ -240,8 +240,10 @@ def __init__(
         self.nsamples = nsamples

     def prepare_layer_wise(self, model_path):
-        from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks
         import os
+
+        from neural_compressor.torch.algorithms.layer_wise import LWQ_WORKSPACE, get_path, register_weight_hooks
+
         os.makedirs(LWQ_WORKSPACE, exist_ok=True)
         if model_path == "":
             model_path = self.model.path
@@ -250,7 +252,7 @@ def prepare_layer_wise(self, model_path):
         register_weight_hooks(
             self.model, self.model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE
         )
-
+
     def get_full_layer_name(self, sub_layer_name, block_idx):
         transformer_name = self.gptq_related_blocks["transformers_name"]
         return ".".join([transformer_name, str(block_idx), sub_layer_name])
@@ -443,6 +445,7 @@ def execute_quantization(self, means=None, stds=None):
                 weight_config_this_layer = self.get_layer_config(full_layer_name)
                 if self.use_layer_wise:  # pragma: no cover
                     from neural_compressor.torch.algorithms.layer_wise import load_value
+
                     W = load_value(self.model, full_layer_name + ".weight", self.model_path)
                 else:
                     W = sub_layers[layer_name].weight.data.clone()
@@ -489,10 +492,10 @@ def tmp(_, inp, out):
                         else:
                             value = load_value(self.model, param_name, self.model_path)
                             set_module_tensor_to_device(self.model, param_name, self.device, value)
-
+
                 else:
                     W = sub_layers[layer_name].weight.data.clone()
-
+
                 accelerator.mark_step()
                 if "hpu" in self.device:
                     W = W.to("cpu")
@@ -504,7 +507,7 @@ def tmp(_, inp, out):
                     act_order=weight_config_this_layer["act_order"],
                     static_groups=weight_config_this_layer["static_groups"],
                 )
-
+
                 # Step 2.5: export to compressed model
                 gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale}
                 if not weight_config_this_layer["sym"]:
@@ -513,7 +516,7 @@ def tmp(_, inp, out):
                     gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[
                         layer_name
                     ].perm
-
+
                 weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx))
                 gptq_scale = gptq_config[self.get_full_layer_name(layer_name, block_idx)]["scale"]
                 if not weight_config_this_layer["sym"]:
@@ -564,7 +567,7 @@ def tmp(_, inp, out):
                     device=self.device,
                 )
                 new_module.pack(int_weight, gptq_scale, gptq_zp, sub_layers[layer_name].bias, gptq_perm)
-
+
                 if self.use_layer_wise:  # pragma: no cover
                     from neural_compressor.torch.algorithms.layer_wise import (
                         LWQ_WORKSPACE,
@@ -595,8 +598,7 @@ def tmp(_, inp, out):
                 self.gptq_related_blocks["transformers"][block_idx] = transformer_block
             else:
                 self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu()
-
-
+
             del gptq_for_this_block
             torch.cuda.empty_cache()
             # iteratively replace the input with output, thus layerwise quantization can continue.