NF4+F8E4M3 support #1127

Closed
Changes from 1 commit
Initial commit
nikita-malininn committed Jan 23, 2025
commit d9ef0e6458f32cbf7a6a6889b133bf3c24e61683
4 changes: 2 additions & 2 deletions docs/source/openvino/export.mdx
@@ -31,7 +31,7 @@ Check out the help for more options:

```text
usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}]
[--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
[--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
[--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,7 +67,7 @@ Optional arguments:
on your local machine arbitrary code present in the model repository.
--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
The weight format of the exported model.
--quant-mode {int8,f8e4m3,f8e5m2}
--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3}
Quantization precision mode. This is used for applying full model quantization including
activations.
--library {transformers,diffusers,timm,sentence_transformers,open_clip}
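With this docs change, the new mode can be selected directly from the CLI. A minimal sketch of an invocation (the model id, dataset name, sample count, and output directory below are placeholders, not values from this PR):

```bash
optimum-cli export openvino \
  -m meta-llama/Llama-3.2-1B \
  --quant-mode nf4_f8e4m3 \
  --dataset wikitext2 \
  --num-samples 128 \
  llama_nf4_f8e4m3_ov
```

A calibration dataset is required here because nf4_f8e4m3, like the other --quant-mode values, applies full model quantization including activations.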
96 changes: 66 additions & 30 deletions optimum/commands/export/openvino.py
@@ -78,7 +78,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
optional_group.add_argument(
"--quant-mode",
type=str,
choices=["int8", "f8e4m3", "f8e5m2"],
choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3"],
default=None,
help=(
"Quantization precision mode. This is used for applying full model quantization including activations. "
@@ -307,7 +307,14 @@ def parse_args(parser: "ArgumentParser"):
def run(self):
from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers
from ...exporters.openvino.utils import save_preprocessors
from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIG, OVConfig, get_default_int4_config
from ...intel.openvino.configuration import (
_DEFAULT_4BIT_CONFIG,
OVCompressWeightsOptions,
OVConfig,
Collaborator review comment: New classes should be exposed in optimum.intel.__init__.

OVGeneralQuantizationConfig,
OVQuantizeOptions,
get_default_int4_config,
)

if self.args.library is None:
# TODO: add revision, subfolder and token to args
@@ -342,43 +349,39 @@ def run(self):
if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4":
quantization_config = get_default_int4_config(self.args.model)
else:
is_int8 = self.args.weight_format == "int8"
quantization_config = {
"bits": 8 if is_int8 else 4,
"ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
"sym": self.args.sym or False,
"group_size": -1 if is_int8 else self.args.group_size,
"all_layers": None if is_int8 else self.args.all_layers,
"dataset": self.args.dataset,
"num_samples": self.args.num_samples,
"quant_method": "awq" if self.args.awq else "default",
"sensitivity_metric": self.args.sensitivity_metric,
"scale_estimation": self.args.scale_estimation,
"gptq": self.args.gptq,
"lora_correction": self.args.lora_correction,
"weight_format": self.args.weight_format,
"backup_precision": self.args.backup_precision,
}
quantization_config = prepare_for_wc_config(self.args, _DEFAULT_4BIT_CONFIG)

if quantization_config.get("dataset", None) is not None:
quantization_config["trust_remote_code"] = self.args.trust_remote_code
ov_config = OVConfig(quantization_config=quantization_config)
else:
elif self.args.quant_mode is not None:
if self.args.dataset is None:
raise ValueError(
"Dataset is required for full quantization. Please provide it with --dataset argument."
)

quantization_config = {
"weight_format": self.args.quant_mode,
"activation_format": self.args.quant_mode,
"bits": 8,
"sym": self.args.sym or False,
"dataset": self.args.dataset,
"num_samples": self.args.num_samples,
"smooth_quant_alpha": self.args.smooth_quant_alpha,
"trust_remote_code": self.args.trust_remote_code,
}
if self.args.quant_mode == "nf4_f8e4m3":
wc_config = prepare_for_wc_config(self.args, _DEFAULT_4BIT_CONFIG)
wc_config["weight_format"] = "nf4"
cw_options = OVCompressWeightsOptions.init_with_format(**wc_config)

q_config = prepare_for_q_config(self.args)
q_config["activation_format"] = "f8e4m3"
q_options = OVQuantizeOptions.init_with_format(**q_config)

quantization_config = OVGeneralQuantizationConfig.init_with_format(
bits=8,
sym=self.args.sym,
ignored_scope=None,
num_samples=self.args.num_samples,
dataset=self.args.dataset,
trust_remote_code=self.args.trust_remote_code,
weight_format=self.args.weight_format,
)
quantization_config.compress_weights_options = cw_options
quantization_config.quantize_options = q_options
else:
quantization_config = prepare_for_q_config(self.args)
ov_config = OVConfig(quantization_config=quantization_config)

quantization_config = ov_config.quantization_config if ov_config else None
@@ -470,3 +473,36 @@ def run(self):
library_name=library_name,
# **input_shapes,
)


def prepare_for_wc_config(args, default_configs):
Collaborator review comment: These methods are helpful! Would you consider renaming them, though? IMO this method prepares the config itself rather than something for the config. Same for prepare_for_q_config.

Suggested change: rename prepare_for_wc_config(args, default_configs) to prepare_wc_config(args, default_configs).

is_int8 = args.weight_format == "int8"
return {
"bits": 8 if is_int8 else 4,
"ratio": 1 if is_int8 else (args.ratio or default_configs["ratio"]),
"sym": args.sym or False,
"group_size": -1 if is_int8 else args.group_size,
"all_layers": None if is_int8 else args.all_layers,
"dataset": args.dataset,
"num_samples": args.num_samples,
"quant_method": "awq" if args.awq else "default",
"sensitivity_metric": args.sensitivity_metric,
"scale_estimation": args.scale_estimation,
"gptq": args.gptq,
"lora_correction": args.lora_correction,
"weight_format": args.weight_format,
"backup_precision": args.backup_precision,
}


def prepare_for_q_config(args):
Collaborator review comment, suggested change: rename prepare_for_q_config(args) to prepare_q_config(args).

return {
"weight_format": args.quant_mode,
"activation_format": args.quant_mode,
"bits": 8,
"sym": args.sym or False,
"dataset": args.dataset,
"num_samples": args.num_samples,
"smooth_quant_alpha": args.smooth_quant_alpha,
"trust_remote_code": args.trust_remote_code,
}
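
For readers skimming the diff, here is a minimal standalone sketch (not part of the PR, with illustrative placeholder values rather than parsed CLI arguments) of the two dictionaries the nf4_f8e4m3 branch assembles: a weight-compression config whose weight format is forced to nf4, and a full-quantization config whose activation format is forced to f8e4m3.

```python
# Illustrative sketch only: mirrors the shape of the configs built for
# --quant-mode nf4_f8e4m3; values are placeholders, not the actual defaults.

# Weight-compression part: prepare_for_wc_config(...) output with the
# weight format overridden to "nf4".
wc_config = {
    "bits": 4,
    "ratio": 1.0,            # --ratio (or the 4-bit default)
    "sym": False,            # --sym
    "group_size": 128,       # --group-size
    "dataset": "wikitext2",  # --dataset (required for full quantization)
    "num_samples": 128,      # --num-samples
    "weight_format": "nf4",  # forced for the nf4_f8e4m3 mode
}

# Full-quantization part: prepare_for_q_config(...) output with the
# activation format overridden to "f8e4m3".
q_config = {
    "bits": 8,
    "sym": False,
    "dataset": "wikitext2",
    "num_samples": 128,
    "activation_format": "f8e4m3",  # forced for the nf4_f8e4m3 mode
}

print(wc_config["weight_format"], q_config["activation_format"])  # nf4 f8e4m3
```

In the PR itself these dictionaries feed OVCompressWeightsOptions.init_with_format and OVQuantizeOptions.init_with_format, and the resulting options are attached to a single OVGeneralQuantizationConfig.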