
Commit 00cd903

Update code with comments

Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
1 parent f68486b commit 00cd903
5 files changed, +58 -10 lines changed
examples/neural_compressor/language-modeling/requirements.txt (+1 -1)

@@ -3,5 +3,5 @@ torch >= 1.9
 datasets >= 1.8.0
 sentencepiece != 0.1.92
 protobuf
-intel-extension-for-transformers >=1.3
+intel-extension-for-transformers >= 1.3
 peft

examples/neural_compressor/language-modeling/run_clm.py (+8 -1)

@@ -33,7 +33,6 @@
 import torch
 import transformers
 from datasets import load_dataset
-from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 from neural_compressor import (
     DistillationConfig,
     PostTrainingQuantConfig,
@@ -58,7 +57,10 @@
 from transformers.utils.versions import require_version

 from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer, INCTrainer
+from optimum.intel.utils.import_utils import is_intel_extension_for_transformers_available

+if is_intel_extension_for_transformers_available():
+    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

 os.environ["CUDA_VISIBLE_DEVICES"] = ""

@@ -626,6 +628,11 @@ def compute_metrics(eval_preds):
     else:
         recipes = {}
     if optim_args.quantization_approach == "weight_only":
+        if not is_intel_extension_for_transformers_available():
+            raise ImportError(
+                "The intel-extension-for-transformers package was not found. "
+                "Please install it along with peft: pip install intel-extension-for-transformers peft."
+            )
         if optim_args.apply_pruning or optim_args.apply_distillation:
             raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
         quantization_config = WeightOnlyQuantConfig(

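The change above moves the `WeightOnlyQuantConfig` import behind an availability guard, so `run_clm.py` still imports cleanly when intel-extension-for-transformers is not installed and only fails when weight-only quantization is actually requested. A minimal sketch of this optional-dependency pattern, assuming only the names shown in the diff (the `make_weight_only_config` helper and its error text are illustrative, not part of the commit):

    from optimum.intel.utils.import_utils import is_intel_extension_for_transformers_available

    if is_intel_extension_for_transformers_available():
        # Imported only when the optional dependency is present, so a plain
        # `python run_clm.py --help` works without it.
        from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig


    def make_weight_only_config(**kwargs):
        # Fail at use time rather than import time when the dependency is missing.
        if not is_intel_extension_for_transformers_available():
            raise ImportError(
                "Weight-only quantization requires intel-extension-for-transformers: "
                "pip install intel-extension-for-transformers peft"
            )
        return WeightOnlyQuantConfig(**kwargs)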
optimum/intel/neural_compressor/quantization.py (+8 -6)

@@ -19,7 +19,7 @@
 from enum import Enum
 from itertools import chain
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Callable, Dict, Optional, TypeAlias, Union

 import torch
 from datasets import Dataset, load_dataset
@@ -80,6 +80,9 @@
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
+    Config: TypeAlias = Union[PostTrainingQuantConfig, WeightOnlyQuantConfig]
+else:
+    Config: TypeAlias = PostTrainingQuantConfig

 logger = logging.getLogger(__name__)

@@ -149,7 +152,7 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs):
     def quantize(
         self,
         save_directory: Union[str, Path],
-        quantization_config=None,
+        quantization_config: Config = None,
         calibration_dataset: Dataset = None,
         batch_size: int = 8,
         data_collator: Optional[DataCollator] = None,
@@ -162,7 +165,7 @@ def quantize(
         Quantize a model given the optimization specifications defined in `quantization_config`.

         Args:
-            quantization_config (`PostTrainingQuantConfig`):
+            quantization_config (`Union[PostTrainingQuantConfig, WeightOnlyQuantConfig]`):
                 The configuration containing the parameters related to quantization.
             save_directory (`Union[str, Path]`):
                 The directory where the quantized model should be saved.
@@ -261,8 +264,7 @@ def quantize(
             save_onnx_model = False

         if (
-            not weight_only
-            and not isinstance(quantization_config, WeightOnlyQuantConfig)
+            isinstance(quantization_config, PostTrainingQuantConfig)
             and quantization_config.backend == "ipex"
             and is_ipex_version("<", IPEX_MINIMUM_VERSION)
             and "generation" in self.task
@@ -272,7 +274,7 @@ def quantize(
                 f"but only version {IPEX_MINIMUM_VERSION} or higher is supported."
             )

-        if isinstance(quantization_config, WeightOnlyQuantConfig):
+        if not isinstance(quantization_config, PostTrainingQuantConfig):
            self._quantized_model = convert_to_quantized_model(self._original_model, quantization_config)
            # Save the quantized model
            output_path = save_directory.joinpath(file_name or default_name)

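As a usage sketch of the widened `quantize` signature, the following mirrors the test added below: a `WeightOnlyQuantConfig` is passed where previously only a `PostTrainingQuantConfig` was accepted (the tiny checkpoint name is taken from the test file):

    import tempfile

    from transformers import AutoModelForCausalLM
    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
    from optimum.intel.neural_compressor import INCQuantizer

    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GPTNeoForCausalLM")
    quantizer = INCQuantizer.from_pretrained(model, task="text-generation")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # RTN-style int8 weight-only quantization needs no calibration dataset;
        # the GPTQ/AWQ variants additionally take `calibration_dataset=...`.
        q_model = quantizer.quantize(
            quantization_config=WeightOnlyQuantConfig(weight_dtype="int8"),
            save_directory=tmp_dir,
        )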
optimum/intel/utils/import_utils.py (+1 -1)

@@ -350,7 +350,7 @@ def is_timm_version(operation: str, version: str):

 INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR = """
 {0} requires the intel-extension-for-transformers library but it was not found in your environment. You can install it with pip:
-`pip install neural-compressor`. Please note that you may need to restart your runtime after installation.
+`pip install intel-extension-for-transformers`. Please note that you may need to restart your runtime after installation.
 """

 DATASETS_IMPORT_ERROR = """

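For context, availability helpers like `is_intel_extension_for_transformers_available` are conventionally built on `importlib`; the sketch below follows that convention, but the actual implementation in `import_utils.py` is not shown in this diff and may differ:

    import importlib.metadata
    import importlib.util

    _ite_available = importlib.util.find_spec("intel_extension_for_transformers") is not None
    if _ite_available:
        try:
            # Confirm the distribution is actually installed, not merely importable.
            importlib.metadata.version("intel-extension-for-transformers")
        except importlib.metadata.PackageNotFoundError:
            _ite_available = False


    def is_intel_extension_for_transformers_available() -> bool:
        return _ite_available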
tests/neural_compressor/test_optimization.py (+40 -1)

@@ -202,14 +202,43 @@ def test_ipex_static_quantization_with_smoothquant(self, task, model_name, expec

     def test_weight_only_quantization(self):
         model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
-        quantization_config = WeightOnlyQuantConfig(weight_dtype="int8")
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
         calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)

         with tempfile.TemporaryDirectory() as tmp_dir:
+            quantization_config = WeightOnlyQuantConfig(weight_dtype="int8")
+            q_model = quantizer.quantize(
+                quantization_config=quantization_config,
+                save_directory=tmp_dir,
+            )
+            inp = torch.tensor([calibration_dataset[0]["input_ids"]])
+            out = model(inp)[0]
+            q_out = q_model(inp)[0]
+            self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            quantization_config = WeightOnlyQuantConfig(
+                algorithm="GPTQ",
+                weight_dtype="int4_clip",
+            )
+            q_model = quantizer.quantize(
+                quantization_config=quantization_config,
+                calibration_dataset=calibration_dataset,
+                save_directory=tmp_dir,
+            )
+            inp = torch.tensor([calibration_dataset[0]["input_ids"]])
+            out = model(inp)[0]
+            q_out = q_model(inp)[0]
+            self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            quantization_config = WeightOnlyQuantConfig(
+                algorithm="AWQ",
+                weight_dtype="int4_clip",
+            )
             q_model = quantizer.quantize(
                 quantization_config=quantization_config,
                 calibration_dataset=calibration_dataset,
@@ -220,6 +249,16 @@ def test_weight_only_quantization(self):
             q_out = q_model(inp)[0]
             self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))

+        with tempfile.TemporaryDirectory() as tmp_dir:
+            q_model = quantizer.quantize(
+                weight_only=True,  # uses the RTN quantization method; the NF4 weight dtype is the default
+                save_directory=tmp_dir,
+            )
+            inp = torch.tensor([calibration_dataset[0]["input_ids"]])
+            out = model(inp)[0]
+            q_out = q_model(inp)[0]
+            self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))
+
     def test_dynamic_accuracy_strategy_quantization(self):
         model_name = "distilbert-base-cased-distilled-squad"
         model = AutoModelForQuestionAnswering.from_pretrained(model_name)
