Skip to content

Commit 569fe61

Browse files
Add int4_f8e4m3 quant mode
1 parent 50c77bf commit 569fe61

File tree

4 files changed

+47
-5
lines changed

4 files changed

+47
-5
lines changed

docs/source/openvino/export.mdx

+3-2
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ Check out the help for more options:
3131

3232
```text
3333
usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
34-
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}]
34+
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}]
35+
[--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}]
3536
[--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
3637
[--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
3738
[--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,7 +68,7 @@ Optional arguments:
6768
on your local machine arbitrary code present in the model repository.
6869
--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
6970
The weight format of the exported model.
70-
--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}
71+
--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,int4_f8e4m3}
7172
Quantization precision mode. This is used for applying full model quantization including
7273
activations.
7374
--library {transformers,diffusers,timm,sentence_transformers,open_clip}

optimum/commands/export/openvino.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
7878
optional_group.add_argument(
7979
"--quant-mode",
8080
type=str,
81-
choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3"],
81+
choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "int4_f8e4m3"],
8282
default=None,
8383
help=(
8484
"Quantization precision mode. This is used for applying full model quantization including activations. "
@@ -359,9 +359,10 @@ def run(self):
359359
"Dataset is required for full quantization. Please provide it with --dataset argument."
360360
)
361361

362-
if self.args.quant_mode == "nf4_f8e4m3":
362+
if self.args.quant_mode in ["nf4_f8e4m3", "int4_f8e4m3"]:
363363
wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_CONFIG)
364-
wc_config["dtype"] = "nf4"
364+
weight_dtype_map = {"nf4_f8e4m3": "nf4", "int4_f8e4m3": "int4"}
365+
wc_config["dtype"] = weight_dtype_map[self.args.quant_mode]
365366

366367
q_config = prepare_q_config(self.args)
367368
q_config["dtype"] = "f8e4m3"

tests/openvino/test_exporters_cli.py

+24
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,30 @@ class OVCLIExportTestCase(unittest.TestCase):
157157
{"int8": 4, "nf4": 14},
158158
],
159159
),
160+
(
161+
"text-generation",
162+
"llama",
163+
"int4_f8e4m3",
164+
"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code",
165+
[
166+
13,
167+
],
168+
[
169+
{"int8": 4, "int4": 28},
170+
],
171+
),
172+
(
173+
"text-generation",
174+
"llama",
175+
"int4_f8e4m3",
176+
"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --group-size 16 --trust-remote-code --sym",
177+
[
178+
13,
179+
],
180+
[
181+
{"int8": 4, "int4": 14},
182+
],
183+
),
160184
]
161185

162186
TEST_4BIT_CONFIGURATIONS = [

tests/openvino/test_quantization.py

+16
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,22 @@ class OVQuantizerTest(unittest.TestCase):
174174
{"int8": 4, "f8e4m3": 4, "nf4": 6},
175175
],
176176
),
177+
(
178+
OVModelForCausalLM,
179+
"llama",
180+
OVMixedQuantizationConfig(
181+
weight_quantization_config=OVWeightQuantizationConfig(bits=4, group_size=16),
182+
full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"),
183+
dataset="wikitext2",
184+
num_samples=1,
185+
),
186+
[
187+
13,
188+
],
189+
[
190+
{"int8": 4, "int4": 28},
191+
],
192+
),
177193
]
178194

179195
@parameterized.expand(SUPPORTED_ARCHITECTURES_TORCH_MODEL)

0 commit comments

Comments (0)