
Commit 8a9dbb9

Support weight-only quantization with quantized operators in intel-extension-for-transformers
1 parent: 86c4655

5 files changed: +12 -17 lines
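
Sketched below is the end-to-end flow this commit enables: build a WeightOnlyQuantConfig, quantize a causal-LM through INCQuantizer, let the quantizer persist the quantized weights to save_directory (the new save step in optimum/intel/neural_compressor/quantization.py), then reload the result with INCModelForCausalLM. The model id and the default WeightOnlyQuantConfig arguments are illustrative assumptions, not part of this commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
from optimum.intel import INCModelForCausalLM, INCQuantizer

model_id = "hf-internal-testing/tiny-random-gpt2"  # placeholder model id (assumption)
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weight-only quantization backed by intel-extension-for-transformers' quantized operators
quantization_config = WeightOnlyQuantConfig()  # library defaults; specific kwargs are an assumption
quantizer = INCQuantizer.from_pretrained(model)
quantizer.quantize(quantization_config=quantization_config, save_directory="woq_model")

# With this commit the quantized weights are written to `save_directory`,
# so the weight-only model round-trips through save/load:
loaded_model = INCModelForCausalLM.from_pretrained("woq_model")
tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    loaded_model(**tokens)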

examples/neural_compressor/language-modeling/run_clm.py (+2 -7)

@@ -33,7 +33,7 @@
 import torch
 import transformers
 from datasets import load_dataset
-from intel_extension_for_transformers.transformers.utils.quantization_config import WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 from neural_compressor import (
     DistillationConfig,
     PostTrainingQuantConfig,
@@ -735,12 +735,7 @@ def compute_metrics(eval_preds):
         )
         trainer.model = quantizer._quantized_model

-    # TODO: Weight only quantization didn't support save/load function now. Will implement it soon.
-    if (
-        optim_args.apply_quantization
-        and optim_args.verify_loading
-        and optim_args.quantization_approach != "weight_only"
-    ):
+    if optim_args.apply_quantization and optim_args.verify_loading:
         loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         with torch.no_grad():

optimum/intel/neural_compressor/__init__.py (+1 -1)

@@ -13,7 +13,7 @@
 # limitations under the License.

 from ..utils.import_utils import is_diffusers_available
-from .configuration import INCConfig
+from .configuration import INCConfig, WeightOnlyQuantConfig
 from .modeling_base import (
     INCModel,
     INCModelForMaskedLM,

optimum/intel/neural_compressor/configuration.py (+3 -2)

@@ -14,6 +14,7 @@

 from typing import Dict, Optional, Union

+from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig
 from neural_compressor.config import DistillationConfig, WeightPruningConfig, _BaseQuantizationConfig

 from optimum.configuration_utils import BaseConfig
@@ -35,7 +36,7 @@ class INCConfig(BaseConfig):

     def __init__(
         self,
-        quantization=None,
+        quantization: Optional[Union[Dict, _BaseQuantizationConfig, WeightOnlyQuantConfig]] = None,
         pruning: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
         distillation: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
         save_onnx_model: bool = False,
@@ -50,7 +51,7 @@ def __init__(
         self.save_onnx_model = save_onnx_model

     @staticmethod
-    def _create_quantization_config(config):
+    def _create_quantization_config(config: Union[Dict, _BaseQuantizationConfig, WeightOnlyQuantConfig]):
         # TODO : add activations_dtype and weights_dtype
         if isinstance(config, _BaseQuantizationConfig):
             approach = _quantization_model[config.approach]
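
With the widened annotation above, a WeightOnlyQuantConfig can be passed directly as the quantization entry of an INCConfig. A minimal sketch of what the new types permit, using the re-export added to optimum/intel/neural_compressor/__init__.py in this commit (the default-constructed config is an assumption):

from optimum.intel.neural_compressor import INCConfig, WeightOnlyQuantConfig

woq_config = WeightOnlyQuantConfig()  # default weight-only settings (assumption)
inc_config = INCConfig(quantization=woq_config)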

optimum/intel/neural_compressor/quantization.py (+5 -6)

@@ -59,7 +59,6 @@
     _ipex_version,
     _neural_compressor_version,
     is_intel_extension_for_transformers_available,
-    is_intel_extension_for_transformers_version,
     is_ipex_version,
     is_neural_compressor_version,
 )
@@ -80,10 +79,7 @@

 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model
-    if is_intel_extension_for_transformers_version("<=", "1.2.2"):
-        from intel_extension_for_transformers.transformers.utils.quantization_config import WeightOnlyQuantConfig
-    else:
-        from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
+    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

 logger = logging.getLogger(__name__)

@@ -186,7 +182,7 @@ def quantize(
         save_directory.mkdir(parents=True, exist_ok=True)
         save_onnx_model = kwargs.pop("save_onnx_model", False)

-        if save_onnx_model and isinstance(self._original_model, ORTModel):
+        if save_onnx_model and (isinstance(self._original_model, ORTModel) or weight_only):
             save_onnx_model = False
             logger.warning("Model provided is an ONNX model, `save_onnx_model` is set to False")

@@ -278,6 +274,9 @@ def quantize(

        if isinstance(quantization_config, WeightOnlyQuantConfig):
            self._quantized_model = convert_to_quantized_model(self._original_model, quantization_config)
+           # Save the quantized model
+           output_path = save_directory.joinpath(file_name or default_name)
+           self._quantized_model.save_pretrained(output_path)
        else:
            if isinstance(self._original_model.config, PretrainedConfig):
                self._original_model.config.backend = quantization_config.backend
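
The widened `save_onnx_model` guard above means an ONNX export request is now dropped for weight-only quantization as well. A minimal sketch of that interaction; the placeholder model id and the default WeightOnlyQuantConfig arguments are assumptions:

from transformers import AutoModelForCausalLM
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
from optimum.intel import INCQuantizer

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")  # placeholder
quantizer = INCQuantizer.from_pretrained(model)

# `save_onnx_model` is popped from kwargs and reset to False for weight-only
# quantization, so only the quantized PyTorch weights end up in `save_directory`.
quantizer.quantize(
    quantization_config=WeightOnlyQuantConfig(),
    save_directory="woq_model",
    save_onnx_model=True,  # ignored here; a warning is logged instead
)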

tests/neural_compressor/test_optimization.py (+1 -1)

@@ -56,7 +56,7 @@
     INCSeq2SeqTrainer,
     INCStableDiffusionPipeline,
 )
-from intel_extension_for_transformers.transformers.utils.quantization_config import WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 from optimum.intel.utils.constant import DIFFUSION_WEIGHTS_NAME
 from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification
 from optimum.pipelines import ORT_SUPPORTED_TASKS
