
Commit f61b7e8

Add 'nf4_f8e5m2', 'int4_f8e5m2'; add backup precision
1 parent e6a30a3

File tree: 5 files changed, +89 -23 lines


docs/source/openvino/export.mdx (+2 -2)

@@ -32,7 +32,7 @@ Check out the help for more options:
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
                                    [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}]
-                                   [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}]
+                                   [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]

@@ -68,7 +68,7 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}
+  --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
                         activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
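With the two new modes, a single export call quantizes weights to nf4 or int4 and activations to f8e5m2. A minimal sketch of the invocation, in the style of the updated help text (`<model_id>` and `<output_dir>` are placeholders; the dataset flag mirrors the new tests below):

```text
optimum-cli export openvino -m <model_id> --quant-mode nf4_f8e5m2 --dataset wikitext2 --group-size 16 <output_dir>
```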

optimum/commands/export/openvino.py (+5 -5)

@@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "int4_f8e4m3"],
+        choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "

@@ -363,13 +363,13 @@ def run(self):
                     "Dataset is required for full quantization. Please provide it with --dataset argument."
                 )

-            if self.args.quant_mode in ["nf4_f8e4m3", "int4_f8e4m3"]:
+            if self.args.quant_mode in ["nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"]:
                 wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG)
-                weight_dtype_map = {"nf4_f8e4m3": "nf4", "int4_f8e4m3": "int4"}
-                wc_config["dtype"] = weight_dtype_map[self.args.quant_mode]
+                wc_dtype, q_dtype = self.args.quant_mode.split("_")
+                wc_config["dtype"] = wc_dtype

                 q_config = prepare_q_config(self.args)
-                q_config["dtype"] = "f8e4m3"
+                q_config["dtype"] = q_dtype

                 quantization_config = {
                     "weight_quantization_config": wc_config,

optimum/intel/openvino/configuration.py (+13 -1)

@@ -483,6 +483,9 @@ def post_init(self):
                 "quantization algorithm is selected and compression ratio is 1.0."
             )

+        if self.dtype in ["int4", "int8"]:
+            self.bits = 4 if self.dtype == "int4" else 8
+
         if self.bits not in [4, 8]:
             raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")

@@ -895,15 +898,24 @@ def __init__(
         """
         if isinstance(weight_quantization_config, dict):
             weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_config)
+        else:
+            weight_quantization_config = weight_quantization_config.clone()
         self.weight_quantization_config = weight_quantization_config
+        wqc = self.weight_quantization_config

         if isinstance(full_quantization_config, dict):
             full_quantization_config = OVQuantizationConfig.from_dict(full_quantization_config)
+        else:
+            full_quantization_config = full_quantization_config.clone()
         self.full_quantization_config = full_quantization_config
+        fqc = self.full_quantization_config
+
+        if fqc.dtype in ["f8e4m3", "f8e5m2"] and wqc.backup_precision is None:
+            # TODO: remove once there is support for FP8 weight compression in NNCF
+            wqc.backup_precision = "none"

         # Pull dataset-related parameters from child configs. This is not the intended use case, but we process it just
         # in case user sets those parameters inside child configs only.
-        wqc, fqc = self.weight_quantization_config, self.full_quantization_config
         num_samples = max((num_samples or 0, wqc.num_samples or 0, fqc.num_samples or 0)) or None
         dataset = dataset or wqc.dataset or fqc.dataset
         tokenizer = tokenizer or wqc.tokenizer or fqc.tokenizer
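The net effect on the public API, as a hedged sketch (import path assumed to match what the tests use; `backup_precision` is assumed to default to `None` on `OVWeightQuantizationConfig`):

```python
from optimum.intel import (
    OVMixedQuantizationConfig,
    OVQuantizationConfig,
    OVWeightQuantizationConfig,
)

# nf4 weights + f8e5m2 activations. backup_precision is left unset (None),
# so the constructor change above forces it to "none" while NNCF lacks
# FP8 weight compression support.
config = OVMixedQuantizationConfig(
    weight_quantization_config=OVWeightQuantizationConfig(bits=4, dtype="nf4", group_size=16),
    full_quantization_config=OVQuantizationConfig(dtype="f8e5m2"),
    dataset="wikitext2",
    num_samples=1,
)
assert config.weight_quantization_config.backup_precision == "none"
```

Note that the child configs are now cloned when passed as objects, so the caller's original config instances are left untouched by this defaulting.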

tests/openvino/test_exporters_cli.py (+21 -9)

@@ -149,36 +149,48 @@ class OVCLIExportTestCase(unittest.TestCase):
         (
             "text-generation",
             "llama",
             "nf4_f8e4m3",
-            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code",
+            "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --ratio 0.5",
             [
-                13,
+                14,
             ],
             [
-                {"int8": 4, "nf4": 14},
+                {"f8e4m3": 11, "nf4": 5},
             ],
         ),
         (
             "text-generation",
             "llama",
-            "int4_f8e4m3",
-            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code",
+            "nf4_f8e5m2",
+            "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --sym --ratio 0.5",
             [
-                13,
+                14,
             ],
             [
-                {"int8": 4, "int4": 28},
+                {"f8e5m2": 11, "nf4": 5},
             ],
         ),
         (
             "text-generation",
             "llama",
             "int4_f8e4m3",
-            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code --sym",
+            "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --sym --ratio 0.5",
+            [
+                14,
+            ],
+            [
+                {"f8e4m3": 11, "int4": 5},
+            ],
+        ),
+        (
+            "text-generation",
+            "llama",
+            "int4_f8e5m2",
+            "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code",
             [
                 13,
             ],
             [
-                {"int8": 4, "int4": 14},
+                {"f8e5m2": 2, "int4": 28},
             ],
         ),
     ]

tests/openvino/test_quantization.py (+48 -6)

@@ -138,16 +138,16 @@ class OVQuantizerTest(unittest.TestCase):
             OVModelForCausalLM,
             "llama",
             dict(
-                weight_quantization_config=dict(bits=4, dtype="nf4", group_size=16, weight_only=True),
+                weight_quantization_config=dict(bits=4, dtype="nf4", group_size=16, weight_only=True, ratio=0.5),
                 full_quantization_config=dict(dtype="f8e4m3", weight_only=False),
                 dataset="wikitext2",
                 num_samples=1,
             ),
             [
-                13,
+                14,
             ],
             [
-                {"int8": 4, "nf4": 14},
+                {"f8e4m3": 11, "nf4": 5},
             ],
         ),
         (

@@ -158,6 +158,7 @@ class OVQuantizerTest(unittest.TestCase):
                     bits=4,
                     dtype="nf4",
                     group_size=16,
+                    ratio=0.5,
                     ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]},
                 ),
                 full_quantization_config=OVQuantizationConfig(

@@ -171,23 +172,64 @@ class OVQuantizerTest(unittest.TestCase):
                 7,
             ],
             [
-                {"int8": 4, "f8e4m3": 4, "nf4": 6},
+                {"f8e4m3": 8, "nf4": 2},
             ],
         ),
         (
             OVModelForCausalLM,
             "llama",
             OVMixedQuantizationConfig(
-                weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16),
+                weight_quantization_config=OVWeightQuantizationConfig(
+                    bits=4,
+                    dtype="nf4",
+                    group_size=16,
+                    ratio=0.5,
+                    ignored_scope={"patterns": ["^__module.model.layers.0.self_attn"]},
+                ),
+                full_quantization_config=OVQuantizationConfig(
+                    dtype="f8e5m2", ignored_scope={"patterns": ["^__module.model.layers.0.mlp"]}
+                ),
+                ignored_scope={"patterns": ["^__module.model.layers.1.self_attn"]},
+                dataset="wikitext2",
+                num_samples=1,
+            ),
+            [
+                7,
+            ],
+            [
+                {"f8e5m2": 8, "nf4": 2},
+            ],
+        ),
+        (
+            OVModelForCausalLM,
+            "llama",
+            OVMixedQuantizationConfig(
+                weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16, ratio=0.5),
                 full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"),
                 dataset="wikitext2",
                 num_samples=1,
             ),
+            [
+                14,
+            ],
+            [
+                {"f8e4m3": 11, "int4": 10},
+            ],
+        ),
+        (
+            OVModelForCausalLM,
+            "llama",
+            OVMixedQuantizationConfig(
+                weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16),
+                full_quantization_config=OVQuantizationConfig(dtype="f8e5m2"),
+                dataset="wikitext2",
+                num_samples=1,
+            ),
             [
                 13,
             ],
             [
-                {"int8": 4, "int4": 28},
+                {"f8e5m2": 2, "int4": 28},
             ],
         ),
     ]
