@@ -45,11 +45,13 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
         ratio (`float`, *optional*, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
-        all_layers (`bool`, *optional*, defaults to False):
+        all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
             The sensitivity metric for assigning quantization precision to layers. In order to
             preserve the accuracy of the model, the more sensitive layers receive a higher precision.
+        awq (`bool`, *optional*):
+            Enables the AWQ method to unify weight ranges and improve overall model accuracy.
         ignored_scope (`nncf.IgnoredScope`, *optional*):
             An ignored scope that defines the list of model control flow graph nodes to be ignored during quantization.
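For context, a minimal usage sketch of the parameters documented above; the import path and all concrete values are illustrative assumptions, not part of this diff:

```python
import nncf
from optimum.intel import OVWeightQuantizationConfig  # assumed import path

# 4-bit symmetric weight compression with the new AWQ flag enabled.
# AWQ needs calibration data, hence the dataset name.
quantization_config = OVWeightQuantizationConfig(
    mode=nncf.CompressWeightsMode.INT4_SYM,
    ratio=0.8,            # 80% of layers in INT4_SYM, the rest in INT8_ASYM
    group_size=128,
    dataset="wikitext2",  # one of the names post_init() accepts
    awq=True,             # flag introduced in this diff
)
```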
@@ -62,9 +64,10 @@ def __init__(
         dataset: Optional[Union[nncf.Dataset, str]] = None,
         ratio: Optional[float] = None,
         group_size: Optional[int] = None,
-        ignored_scope: Optional[nncf.IgnoredScope] = None,
         all_layers: Optional[bool] = None,
         sensitivity_metric: Optional[nncf.SensitivityMetric] = None,
+        awq: Optional[bool] = None,
+        ignored_scope: Optional[nncf.IgnoredScope] = None,
         **kwargs,
     ):
         self.mode = mode
@@ -75,6 +78,7 @@ def __init__(
         self.ignored_scope = ignored_scope
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
+        self.awq = awq
         self.post_init()

     def post_init(self):
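The `post_init()` call above validates the fields; judging from the error message in the next hunk, an unsupported dataset name is rejected. A hypothetical check (the exception type is inferred, not shown in the diff):

```python
# Accepted: a dataset name from the allowed list.
OVWeightQuantizationConfig(dataset="wikitext2")

# Rejected: post_init() raises for names outside
# ['wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'].
try:
    OVWeightQuantizationConfig(dataset="my-private-corpus")
except ValueError as err:  # exception type assumed
    print(err)
```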
@@ -92,25 +96,25 @@ def post_init(self):
                     ['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}"""
                 )

+DEFAULT_4BIT_CONFIGS = {
+    "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5},
+    "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
+    "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
+    "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
+    "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
+    "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6},
+    "llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
+    "llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
+    "llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
+    "stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
+    "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
+    "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
+    "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
+    "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
+    "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
+}

 def _check_default_4bit_configs(config: PretrainedConfig):
-    DEFAULT_4BIT_CONFIGS = {
-        "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5},
-        "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
-        "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-        "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
-        "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
-        "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6},
-        "llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-        "llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-        "llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-        "stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-        "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8},
-        "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
-        "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
-        "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
-        "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
-    }
     return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
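A sketch of how the now module-level table is consulted. `_check_default_4bit_configs` matches on `config.name_or_path`, so the key must equal the bare model name (e.g. a local checkout directory); the stand-in config object below is illustrative:

```python
from types import SimpleNamespace

# Stand-in for a transformers.PretrainedConfig loaded from a local
# "zephyr-7b-beta" directory, so name_or_path matches a table key.
config = SimpleNamespace(name_or_path="zephyr-7b-beta")

default = _check_default_4bit_configs(config)
# -> {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6}
```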
@@ -146,6 +150,7 @@ def compress_decoder_weights(model, quantization_config: Union[OVWeightQuantizat
             group_size=config.group_size,
             all_layers=config.all_layers,
             sensitivity_metric=config.sensitivity_metric,
+            awq=config.awq,
             ignored_scope=config.ignored_scope,
             dataset=dataset,
         )
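Put together, `compress_decoder_weights` forwards the config to NNCF's weight compression API roughly as below. `mode` and `ratio` sit outside this hunk, so their inclusion here is an assumption:

```python
import nncf

# Rough shape of the underlying call; argument names mirror the hunk above.
compressed_model = nncf.compress_weights(
    model,
    mode=config.mode,                             # assumed, outside this hunk
    ratio=config.ratio,                           # assumed, outside this hunk
    group_size=config.group_size,
    all_layers=config.all_layers,
    sensitivity_metric=config.sensitivity_metric,
    awq=config.awq,                               # forwarded by this diff
    ignored_scope=config.ignored_scope,
    dataset=dataset,                              # nncf.Dataset for calibration
)
```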