Fp8 implementation #1100
Changes from 9 commits
Changes to the OpenVINO quantization configuration:

```diff
@@ -26,7 +26,7 @@
 from optimum.configuration_utils import BaseConfig

 from ..utils.import_utils import is_nncf_available
-from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS


 if is_nncf_available():
```
```diff
@@ -638,9 +638,9 @@ def __init__(
             SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
             reduces quantization error.
         weight_format (`str`, defaults to "int8"):
-            Data format weights are quantized to. Possible values: ['int8'].
+            Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
         activation_format (`str`, defaults to "int8"):
-            Data format activations are compressed to. Possible values: ['int8'].
+            Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
         """
         super().__init__(
             bits=bits,
```
```diff
@@ -669,23 +669,20 @@ def post_init(self):
         if self.bits != 8:
             raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")

-        if self.dataset is not None:
-            if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
-                raise ValueError(
-                    f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
-                    f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
-                )
-
         if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
             raise ValueError(
                 f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
             )

-        if self.weight_format != "int8":
-            raise ValueError("Only 'int8' weight format is currently supported.")
-
-        if self.activation_format != "int8":
-            raise ValueError("Only 'int8' activation format is currently supported.")
+        if not self.sym:
+            if self.activation_format != "int8":
+                raise ValueError(
+                    f"Asymmetric quantization can not be performed in {self.activation_format} activation format."
+                )
+            if self.weight_format != "int8":
+                raise ValueError(
+                    f"Asymmetric quantization can not be performed in {self.weight_format} weight format."
+                )
```
In `OVConfig.__init__`:

```diff
@@ -713,8 +710,6 @@ def __init__(
         if self.quantization_config is not None:
             if isinstance(self.quantization_config, OVWeightQuantizationConfig):
                 self.dtype = self.quantization_config.weight_format
-            else:
-                self.dtype = "int8"
         else:
             self.dtype = dtype
```
Review comment on this hunk: I believe this should be changed to:
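For orientation, here is a minimal sketch of how the new formats would be requested through the Python API. It assumes `OVQuantizationConfig` is exposed from `optimum.intel` with the parameters documented in the docstring above; the exact constructor signature is not shown in this diff.

```python
from optimum.intel import OVQuantizationConfig

# Full (static) quantization with fp8 weights and activations.
# post_init() only accepts bits == 8, and with this change it rejects any
# non-int8 format when sym is False, so symmetric quantization is requested.
config = OVQuantizationConfig(
    bits=8,
    sym=True,
    weight_format="f8e4m3",
    activation_format="f8e4m3",
)

# An asymmetric fp8 config would now fail validation:
# OVQuantizationConfig(bits=8, sym=False, activation_format="f8e4m3")  # raises ValueError
```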
Changes to the OpenVINO exporter CLI tests:

```diff
@@ -114,10 +114,19 @@ class OVCLIExportTestCase(unittest.TestCase):
         (
             "automatic-speech-recognition",
             "whisper",
-            "--quant-mode int8 --dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
+            "int8",
+            "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
             (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
             (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
         ),
+        (
+            "text-generation",
+            "llama",
+            "f8e4m3",
+            "--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code --sym",
+            (13,),
+            (16,),
+        ),
     ]

     TEST_4BIT_CONFIGURATIONS = [
```

Review comment on lines +128 to +129: Do I understand correctly that applying quantization to language models is the intended use case for fp8 quantization?

Reply: I don't know what the purpose of the fp8 usage is. The ticket mentions at least LLMs and diffusers.
```diff
@@ -407,26 +416,27 @@ def test_exporters_cli_full_quantization(
         self,
         task: str,
         model_type: str,
+        quant_mode: str,
         option: str,
         expected_num_fq_nodes_per_model: Tuple[int],
         expected_num_weight_nodes_per_model: Tuple[int],
     ):
         with TemporaryDirectory() as tmpdir:
             subprocess.run(
-                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} {option} {tmpdir}",
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --quant-mode {quant_mode} {option} {tmpdir}",
                 shell=True,
                 check=True,
             )
             model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir)

-            submodels = []
+            models = [model]
             if task == "automatic-speech-recognition":
-                submodels = [model.encoder, model.decoder, model.decoder_with_past]
-            self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels))
-            for i, model in enumerate(submodels):
-                actual_num_fq_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
-                self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_fq_nodes)
-                self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes["int8"])
+                models = [model.encoder, model.decoder, model.decoder_with_past]
+            self.assertEqual(len(expected_num_fq_nodes_per_model), len(models))
+            for i, model in enumerate(models):
+                actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
+                self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_f_nodes)
+                self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])

     def test_exporters_cli_int4_with_local_model_and_default_config(self):
         with TemporaryDirectory() as tmpdir:
```
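Spelled out, the command the new llama test case builds is roughly the following. The model name and output directory are placeholders here, not values from the test suite:

```python
import subprocess

model_id = "<llama-checkpoint>"   # the test substitutes a tiny model from MODEL_NAMES["llama"]
output_dir = "<output-dir>"

# Mirrors the subprocess call in test_exporters_cli_full_quantization with quant_mode="f8e4m3".
subprocess.run(
    f"optimum-cli export openvino --model {model_id} --quant-mode f8e4m3 "
    f"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 "
    f"--trust-remote-code --sym {output_dir}",
    shell=True,
    check=True,
)
```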
Changes to the test utilities:

```diff
@@ -202,31 +202,31 @@


 def get_num_quantized_nodes(model):
-    num_fake_quantize = 0
-    num_weight_nodes = {
-        "int8": 0,
-        "int4": 0,
-        "f4e2m1": 0,
-        "f8e8m0": 0,
-        "nf4": 0,
+    num_fake_nodes = 0
+    types_map = {
+        "i8": "int8",
+        "u8": "int8",
+        "i4": "int4",
+        "u4": "int4",
+        "f4e2m1": "f4e2m1",
+        "f8e8m0": "f8e8m0",
+        "nf4": "nf4",
+        "f8e4m3": "f8e4m3",
+        "f8e5m2": "f8e5m2",
     }
+    num_weight_nodes = {n: 0 for n in types_map.values()}
     ov_model = model if isinstance(model, ov.Model) else model.model
     for elem in ov_model.get_ops():
         if "FakeQuantize" in elem.name:
-            num_fake_quantize += 1
+            num_fake_nodes += 1
+        if "FakeConvert" in elem.name:
+            num_fake_nodes += 1
         for i in range(elem.get_output_size()):
             type_name = elem.get_output_element_type(i).get_type_name()
-            if type_name in ["i8", "u8"]:
-                num_weight_nodes["int8"] += 1
-            if type_name in ["i4", "u4"]:
-                num_weight_nodes["int4"] += 1
-            if type_name == "f4e2m1":
-                num_weight_nodes["f4e2m1"] += 1
-            if type_name == "f8e8m0":
-                num_weight_nodes["f8e8m0"] += 1
-            if type_name == "nf4":
-                num_weight_nodes["nf4"] += 1
-    return num_fake_quantize, num_weight_nodes
+            if type_name in types_map:
+                name = types_map[type_name]
+                num_weight_nodes[name] += 1
+    return num_fake_nodes, num_weight_nodes


 @contextmanager
```

Review comment on lines +210 to +211: 👍
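A short usage sketch of the refactored helper; the model path is a placeholder, and any OpenVINO IR produced by the export above would do:

```python
import openvino as ov

from utils_tests import get_num_quantized_nodes  # test helper shown above

# Load an exported OpenVINO model (placeholder path); the helper also accepts
# wrappers that expose a `.model` attribute, per the isinstance check above.
ov_model = ov.Core().read_model("<output-dir>/openvino_model.xml")

num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(ov_model)
print(num_fake_nodes)              # FakeQuantize + FakeConvert operations
print(num_weight_nodes["f8e4m3"])  # per-type counts now include the fp8 formats
```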
Review comment: I would suggest initializing `sym` as `True` inside the `OVQuantizationConfig` constructor if an fp8 mode is selected. This option is intended to be used with int data types and does not quite make sense with fp8 data types. Also, this way `--sym` won't need to be specified every time fp8 modes are used. cc @AlexKoff88
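A possible shape of that suggestion, purely as an illustration (the helper name and constant are assumptions, not code from this PR):

```python
# Illustrative sketch of defaulting to symmetric quantization for fp8 modes.
FP8_FORMATS = ("f8e4m3", "f8e5m2")

def _resolve_sym(sym: bool, weight_format: str, activation_format: str) -> bool:
    """Force symmetric quantization when an fp8 weight or activation format is selected."""
    if weight_format in FP8_FORMATS or activation_format in FP8_FORMATS:
        return True
    return sym
```

Calling something like `sym = _resolve_sym(sym, weight_format, activation_format)` at the top of `OVQuantizationConfig.__init__`, before delegating to `super().__init__`, would make passing `--sym` unnecessary whenever an fp8 mode is chosen.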