[OV] Move data-driven quantization after model export for text-generation models #721
@@ -15,6 +15,7 @@
import logging
import sys
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Optional
@@ -128,6 +129,29 @@ def parse_args_openvino(parser: "ArgumentParser"):
            "compression is applied, they are compressed to INT8."
        ),
    )
    optional_group.add_argument(
        "--quant-method",
        type=str,
        default=None,
        choices=["default", "awq", "hybrid"],
        help=("The quantization method to apply. Can be one of the following: ['default', 'awq', 'hybrid']."),
    )
    optional_group.add_argument(
        "--sensitivity-metric",
        type=str,
        default=None,
        help=(
            "The sensitivity metric for assigning quantization precision to layers. Can be one of the following: "
            "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', "
            "'max_activation_variance', 'mean_activation_magnitude']."
        ),
    )
    optional_group.add_argument(
        "--num-samples",
        type=int,
        default=None,
        help=("The maximum number of samples composing the calibration dataset for quantization."),
    )
    optional_group.add_argument(
        "--disable-stateful",
        action="store_true",
@@ -180,7 +204,7 @@ def parse_args(parser: "ArgumentParser"):
        return parse_args_openvino(parser)

    def run(self):
        from ...exporters.openvino.__main__ import main_export
        from ...exporters.openvino.__main__ import main_export, infer_task
        from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig

        if self.args.fp16:
@@ -208,6 +232,9 @@ def run(self):
            and self.args.group_size is None
            and self.args.sym is None
            and self.args.all_layers is None
            and self.args.dataset is None
            and self.args.quant_method is None
            and self.args.sensitivity_metric is None
            and self.args.model in _DEFAULT_4BIT_CONFIGS
        ):
            quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -218,6 +245,10 @@ def run(self):
                "sym": self.args.sym or False,
                "group_size": -1 if is_int8 else self.args.group_size,
                "all_layers": None if is_int8 else self.args.all_layers,
                "dataset": self.args.dataset,
                "num_samples": self.args.num_samples,
                "quant_method": self.args.quant_method,
                "sensitivity_metric": self.args.sensitivity_metric,
            }

            if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
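As an aside, the options above feed a plain dictionary that is later wrapped into an OVConfig (see the ov_config assignment in the next hunk). A minimal sketch of that mapping with placeholder values, showing only the keys visible in this hunk (other entries, such as the bit width, are omitted):

from optimum.intel import OVConfig

# Placeholder values standing in for the parsed CLI arguments (self.args.*).
quantization_config = {
    "sym": False,
    "group_size": 128,
    "all_layers": None,
    "dataset": "wikitext2",                           # --dataset
    "num_samples": 128,                               # --num-samples
    "quant_method": "awq",                            # --quant-method
    "sensitivity_metric": "max_activation_variance",  # --sensitivity-metric
}
ov_config = OVConfig(quantization_config=quantization_config)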
@@ -226,7 +257,6 @@ def run(self):
                )
                quantization_config["sym"] = "asym" not in self.args.weight_format
                quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
                quantization_config["dataset"] = self.args.dataset
        ov_config = OVConfig(quantization_config=quantization_config)

        library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
@@ -290,6 +320,19 @@ def run(self):
            if tokenizer_2 is not None:
                export_tokenizer(tokenizer_2, output / "tokenizer_2")
        else:
            task = infer_task(self.args.task, self.args.model)
            quantization_config = ov_config.quantization_config
            quantize_after_export = (
                task.startswith("text-generation")
                and quantization_config is not None
                and hasattr(quantization_config, "dataset")
                and quantization_config.dataset is not None
            )
            if quantize_after_export:
                # In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is
                # required. That's why the quantization is skipped during export and applied explicitly after export.
                ov_config.quantization_config = None

            # TODO : add input shapes
            main_export(
                model_name_or_path=self.args.model,
@@ -305,3 +348,19 @@ def run(self):
                library_name=library_name,
                # **input_shapes,
            )

            if quantize_after_export:
                from optimum.intel import OVModelForCausalLM, OVQuantizer

                model = OVModelForCausalLM.from_pretrained(self.args.output)
                quantizer = OVQuantizer(model)
                quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output)
                # TODO: set save_directory=self.args.output once OV is updated to 2024.3
                quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
                with tempfile.TemporaryDirectory() as temp_dir:
                    import shutil

                    model.save_pretrained(temp_dir)
                    ov_config.save_pretrained(self.args.output)
                    shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml")
                    shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin")
Review discussion

Had to add this workaround because OpenVINO does not currently support saving into the same location the model is loaded from (ticket 110054). This is expected to be fixed in OV 2024.3.

@eaidova, please take a look.

Maybe we could introduce a model_name parameter for the from_pretrained/save_pretrained methods? That would allow keeping both models in the same directory (and it could also be useful for loading a model whose IR was saved by a different tool or renamed). Alternatively, we could try disabling mmap via ov_config for now; that should help with saving to the same location.

@eaidova thank you for your suggestion! I've replaced saving to a temporary directory with disabling mmap.
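For context, a minimal sketch of the mmap-disabling alternative discussed above (not what this PR ultimately keeps). It assumes the OpenVINO ENABLE_MMAP property is forwarded from ov_config when the exported IR is read, so that saving back into the same directory becomes possible; the directory name and quantization values are placeholders.

from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

export_dir = "exported_ov_model"  # placeholder for the export output directory

# Load the exported model with memory mapping disabled so the weight file is not
# kept mapped from disk (assumption: ENABLE_MMAP is honored when the IR is read).
model = OVModelForCausalLM.from_pretrained(export_dir, ov_config={"ENABLE_MMAP": "NO"})

# Apply data-driven weight quantization and write the result back to the same directory.
quantization_config = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", num_samples=128)
quantizer = OVQuantizer(model)
quantizer.quantize(
    ov_config=OVConfig(quantization_config=quantization_config),
    save_directory=export_dir,
)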
For some reason, when doing it this way I observe that a significant amount of additional memory is allocated. The amount roughly equals the model size, which is rather significant. I guess I'll revert these changes for now.

@nikita-savelyevv, does this workaround with the tmp folder mean that we cannot save the model at the same path but can copy files there? That looks a bit strange.

Yes, I had trouble saving to the same location, but copying to that location works fine.