
[OV] Move data-driven quantization after model export for text-generation models #721

Merged: 29 commits, merged on Jun 6, 2024
Commits (29); the diff below shows changes from 1 commit:
56878bb
Add quantization with dataset after model export for text-generation …
nikita-savelyevv May 21, 2024
013a0f6
Tweak AWQ CLI interface
nikita-savelyevv May 21, 2024
c566ccc
Additional checks
nikita-savelyevv May 21, 2024
0a8fba0
Fix
nikita-savelyevv May 21, 2024
6dbb4fe
Trigger Build
nikita-savelyevv May 21, 2024
3722624
Add AWQ description
nikita-savelyevv May 22, 2024
dee582d
Add trust remote code argument
nikita-savelyevv May 22, 2024
a44c096
Black
nikita-savelyevv May 22, 2024
12dc672
Add note about possibility of skipping AWQ
nikita-savelyevv May 22, 2024
bcc4665
Removed saving to temporary directory; added core property handling f…
nikita-savelyevv May 23, 2024
40058da
Revert "Removed saving to temporary directory; added core property ha…
nikita-savelyevv May 23, 2024
0886f7e
Add saving intermediate weights in fp16; add removal of intermediate …
nikita-savelyevv May 23, 2024
ee9b1b7
Trigger checks
nikita-savelyevv May 23, 2024
cb57068
Trigger checks
nikita-savelyevv May 24, 2024
ee0b67f
Trigger checks
nikita-savelyevv May 28, 2024
cacbb36
Fix test
nikita-savelyevv May 31, 2024
814d96c
Refactor applying quantization with dataset
nikita-savelyevv May 31, 2024
d8017ab
Bring back quantization_config parameter
nikita-savelyevv May 31, 2024
24272dc
Trigger checks
nikita-savelyevv May 31, 2024
40b0e29
Apply comment
nikita-savelyevv Jun 3, 2024
f54aa40
Save tokenizer
nikita-savelyevv Jun 4, 2024
96bed29
Export CausalLM tokenizer
nikita-savelyevv Jun 4, 2024
a6005ad
Remove unneccessary if
nikita-savelyevv Jun 4, 2024
e311916
Remove extra variable
nikita-savelyevv Jun 4, 2024
fc44214
ruff
nikita-savelyevv Jun 4, 2024
709085b
Ruff 2
nikita-savelyevv Jun 4, 2024
a2084d9
Introduce a separate function to tokenizer conversion
nikita-savelyevv Jun 5, 2024
e8cc0e9
Black
nikita-savelyevv Jun 5, 2024
6815773
Merge branch 'main' into cli-awq
echarlaix Jun 6, 2024
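
For reference, the options added in this PR would be invoked from the CLI roughly as follows. This is a sketch only: the model name and output directory are placeholders, and the flag values mirror the AWQ test configuration added in tests/openvino/test_exporters_cli.py below.

    optimum-cli export openvino \
      --model meta-llama/Llama-2-7b-hf \
      --task text-generation-with-past \
      --weight-format int4 \
      --sym --group-size 16 --ratio 1.0 \
      --quant-method awq \
      --dataset wikitext2 \
      --num-samples 100 \
      --sensitivity-metric max_activation_variance \
      llama2_int4_awq_ov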
63 changes: 61 additions & 2 deletions optimum/commands/export/openvino.py
@@ -15,6 +15,7 @@

import logging
import sys
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Optional

@@ -128,6 +129,29 @@ def parse_args_openvino(parser: "ArgumentParser"):
"compression is applied, they are compressed to INT8."
),
)
optional_group.add_argument(
"--quant-method",
type=str,
default=None,
choices=["default", "awq", "hybrid"],
help=("The quantization method to apply. Can be one of the following: ['default', 'awq', 'hybrid']."),
)
optional_group.add_argument(
"--sensitivity-metric",
type=str,
default=None,
help=(
"The sensitivity metric for assigning quantization precision to layers. Can be one of the following: "
"['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', "
"'max_activation_variance', 'mean_activation_magnitude']."
),
)
optional_group.add_argument(
"--num-samples",
type=int,
default=None,
help=("The maximum number of samples composing the calibration dataset for quantization."),
)
optional_group.add_argument(
"--disable-stateful",
action="store_true",
@@ -180,7 +204,7 @@ def parse_args(parser: "ArgumentParser"):
return parse_args_openvino(parser)

def run(self):
from ...exporters.openvino.__main__ import main_export
from ...exporters.openvino.__main__ import main_export, infer_task
from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig

if self.args.fp16:
@@ -208,6 +232,9 @@ def run(self):
and self.args.group_size is None
and self.args.sym is None
and self.args.all_layers is None
and self.args.dataset is None
and self.args.quant_method is None
and self.args.sensitivity_metric is None
and self.args.model in _DEFAULT_4BIT_CONFIGS
):
quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model]
@@ -218,6 +245,10 @@
"sym": self.args.sym or False,
"group_size": -1 if is_int8 else self.args.group_size,
"all_layers": None if is_int8 else self.args.all_layers,
"dataset": self.args.dataset,
"num_samples": self.args.num_samples,
"quant_method": self.args.quant_method,
"sensitivity_metric": self.args.sensitivity_metric,
}

if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
@@ -226,7 +257,6 @@
)
quantization_config["sym"] = "asym" not in self.args.weight_format
quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
quantization_config["dataset"] = self.args.dataset
ov_config = OVConfig(quantization_config=quantization_config)

library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
@@ -290,6 +320,19 @@ def run(self):
if tokenizer_2 is not None:
export_tokenizer(tokenizer_2, output / "tokenizer_2")
else:
task = infer_task(self.args.task, self.args.model)
quantization_config = ov_config.quantization_config
quantize_after_export = (
task.startswith("text-generation")
and quantization_config is not None
and hasattr(quantization_config, "dataset")
and quantization_config.dataset is not None
)
if quantize_after_export:
# In order to quantize a text-generation model with a dataset, an instance of OVModelForCausalLM is
# required. That's why the quantization is skipped during export and applied explicitly after export.
ov_config.quantization_config = None

# TODO : add input shapes
main_export(
model_name_or_path=self.args.model,
@@ -305,3 +348,19 @@ def run(self):
library_name=library_name,
# **input_shapes,
)

if quantize_after_export:
from optimum.intel import OVModelForCausalLM, OVQuantizer

model = OVModelForCausalLM.from_pretrained(self.args.output)
quantizer = OVQuantizer(model)
quantization_config.tokenizer = quantization_config.tokenizer or str(self.args.output)
# TODO: set save_directory=self.args.output once OV is updated to 2024.3
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
with tempfile.TemporaryDirectory() as temp_dir:
import shutil

model.save_pretrained(temp_dir)
ov_config.save_pretrained(self.args.output)
Collaborator: @nikita-savelyevv, does this workaround with a tmp folder mean that we cannot save the model to the same path, but can copy files there? Looks a bit strange.

Collaborator (Author): Yes, I had trouble saving to the same location, but copying to that location works fine.

shutil.copy(f"{temp_dir}/openvino_model.xml", f"{self.args.output}/openvino_model.xml")
shutil.copy(f"{temp_dir}/openvino_model.bin", f"{self.args.output}/openvino_model.bin")
Collaborator (Author): Had to add this workaround because OpenVINO does not currently support saving into the same location where the model is loaded from (ticket 110054). This is expected to be fixed in OV 2024.3.

Collaborator: @eaidova, please take a look.

Collaborator: Maybe we can introduce a model_name parameter for the from_pretrained/save_pretrained methods? That would allow having both models in the same directory (at the same time it may be useful for loading a model whose IR was saved by different tools or renamed). Or we can try disabling mmap via ov_config for now (it should help with saving to the same location).

Collaborator (Author): @eaidova thank you for your suggestion! I've replaced saving to a temporary directory with disabling mmap.

For some reason, when doing it this way I observe that a significant amount of additional memory is allocated. The amount roughly equals the model size, which is rather significant. I guess I'll revert these changes for now.
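
The mmap-disabling alternative discussed above could look roughly like the sketch below. This is an illustration only, not what the PR finally ships (the temporary-directory copy is kept); the "ENABLE_MMAP" property name is an assumption based on the OpenVINO Core property of the same name, and output_dir is a placeholder path.

    import openvino as ov

    output_dir = "llama2_int4_awq_ov"  # placeholder: the CLI's positional output argument

    core = ov.Core()
    # Assumed property: read IR weights into memory instead of memory-mapping the .bin file,
    # so that saving back into the same directory does not clash with an open mapping.
    core.set_property({"ENABLE_MMAP": False})
    model = core.read_model(f"{output_dir}/openvino_model.xml")
    # ... apply data-driven quantization to `model` here ...
    ov.save_model(model, f"{output_dir}/openvino_model.xml")
    # Trade-off observed above: without mmap, roughly one extra model-size worth of memory
    # is allocated while the weights are held in RAM.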

30 changes: 17 additions & 13 deletions optimum/exporters/openvino/__main__.py
@@ -44,6 +44,22 @@
logger = logging.getLogger(__name__)


def infer_task(task, model_name_or_path):
task = TasksManager.map_from_synonym(task)
if task == "auto":
try:
task = TasksManager.infer_task_from_model(model_name_or_path)
except KeyError as e:
raise KeyError(
f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
except RequestsConnectionError as e:
raise RequestsConnectionError(
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
return task


def main_export(
model_name_or_path: str,
output: Union[str, Path],
@@ -174,7 +190,7 @@ def main_export(
ov_config = OVConfig(quantization_config=q_config)

original_task = task
task = TasksManager.map_from_synonym(task)
task = infer_task(task, model_name_or_path)
framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)
library_name_is_not_provided = library_name is None
library_name = TasksManager.infer_library_from_model(
@@ -188,18 +204,6 @@
)
library_name = "transformers"

if task == "auto":
try:
task = TasksManager.infer_task_from_model(model_name_or_path)
except KeyError as e:
raise KeyError(
f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
except RequestsConnectionError as e:
raise RequestsConnectionError(
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

do_gptq_patching = False
custom_architecture = False
loading_kwargs = {}
12 changes: 11 additions & 1 deletion tests/openvino/test_exporters_cli.py
@@ -89,6 +89,14 @@ class OVCLIExportTestCase(unittest.TestCase):
("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86),
("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86),
("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32),
(
"text-generation-with-past",
"llama_awq",
"int4 --ratio 1.0 --sym --group-size 16 --quant-method awq --dataset wikitext2 --num-samples 100 "
"--sensitivity-metric max_activation_variance",
4,
28,
),
]

def _openvino_export(
@@ -197,17 +205,19 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
@parameterized.expand(TEST_4BIT_CONFIGURATONS)
def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int):
with TemporaryDirectory() as tmpdir:
subprocess.run(
result = subprocess.run(
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
shell=True,
check=True,
capture_output=True,
)
model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs)

_, num_int8, num_int4 = get_num_quantized_nodes(model)
self.assertEqual(expected_int8, num_int8)
self.assertEqual(expected_int4, num_int4)
self.assertTrue("--quant-method awq" not in option or b"Applying AWQ" in result.stdout)

def test_exporters_cli_help(self):
subprocess.run(
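
For completeness, a hedged sketch of the equivalent programmatic path. The field names below are assumed to match the quantization_config keys assembled by the CLI command above (bits, sym, group_size, ratio, dataset, num_samples, quant_method, sensitivity_metric); the model id and output path are placeholders.

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    # Assumed to correspond to:
    #   --weight-format int4 --sym --group-size 16 --ratio 1.0 --quant-method awq \
    #   --dataset wikitext2 --num-samples 100 --sensitivity-metric max_activation_variance
    quantization_config = OVWeightQuantizationConfig(
        bits=4,
        sym=True,
        group_size=16,
        ratio=1.0,
        dataset="wikitext2",
        num_samples=100,
        quant_method="awq",
        sensitivity_metric="max_activation_variance",
    )

    model = OVModelForCausalLM.from_pretrained(
        "facebook/opt-125m",              # placeholder model id
        export=True,                      # export to OpenVINO IR on the fly
        quantization_config=quantization_config,
    )
    model.save_pretrained("opt125m_int4_awq_ov")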