Commit 0cc7c00

Fixed UT and examples error
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
1 parent 3ca3f60 · commit 0cc7c00

10 files changed (+117 -44 lines)

.github/workflows/test_inc.yml (+4 -1)

@@ -30,8 +30,11 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
+          pip install cmake>=3.16
+          pip install py-cpuinfo
+          pip install torch==2.1.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu
           pip install .[neural-compressor,diffusers,tests]
-          pip install intel-extension-for-pytorch
+          pip install intel-extension-for-pytorch==2.1.100
       - name: Test with Pytest
         run: |
           pytest tests/neural_compressor/

examples/neural_compressor/language-modeling/run_clm.py (+48 -12)

@@ -63,6 +63,8 @@
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

+from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
+
 os.environ["CUDA_VISIBLE_DEVICES"] = ""

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -147,7 +149,9 @@ class OptimizationArguments:
     )
     quantization_approach: str = field(
         default="dynamic",
-        metadata={"help": "Quantization approach. Supported approach are static, dynamic and aware_training."},
+        metadata={
+            "help": "Quantization approach. Supported approach are static, dynamic aware_training and weight_only."
+        },
     )
     smooth_quant: bool = field(
         default=False,
@@ -200,8 +204,12 @@ class OptimizationArguments:
         default=False,
         metadata={"help": "Whether or not to verify the loading of the quantized model."},
     )
+    bits: str = field(
+        default="4",
+        metadata={"help": "Bits number of weight for weight only quantization. 1~8 bits."},
+    )
     weight_dtype: str = field(
-        default="int8",
+        default="int4_clip",
         metadata={"help": "weight dtype for weight only quantization."},
     )
     group_size: int = field(
@@ -218,9 +226,24 @@ class OptimizationArguments:
     )
     quantization_methodology: str = field(
         default="RTN",
-        metadata={
-            "help": "Quantization methodology for weight only quantization. Choose from 'RTN', 'AWQ' and 'GPTQ'."
-        },
+        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
+    )
+    gptq_percdamp: float = field(
+        default=0.01,
+        metadata={"help": "Percent of the average Hessian diagonal to use for dampening."},
+    )
+    gptq_block_size: int = field(
+        default=128,
+        metadata={"help": "Block size. sub weight matrix size to run GPTQ."},
+    )
+    gptq_nsamples: int = field(default=128, metadata={"help": "Number of calibration data samples."})
+    gptq_use_max_length: bool = field(
+        default=False,
+        metadata={"help": "Set all sequence length to be same length of args.gptq_pad_max_length"},
+    )
+    gptq_pad_max_length: int = field(
+        default=2048,
+        metadata={"help": "Calibration dataset sequence max length, this should align with your model config"},
     )


@@ -636,11 +659,21 @@ def compute_metrics(eval_preds):
             )
             if optim_args.apply_pruning or optim_args.apply_distillation:
                 raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
+            if optim_args.quantization_methodology == "GPTQ":
+                algorithm_args = {
+                    "act_order": False,
+                    "percdamp": optim_args.gptq_percdamp,
+                    "block_size": optim_args.gptq_block_size,
+                    "nsamples": optim_args.gptq_nsamples,
+                    "use_max_length": optim_args.gptq_use_max_length,
+                    "pad_max_length": optim_args.gptq_pad_max_length,
+                }
             quantization_config = WeightOnlyQuantConfig(
                 weight_dtype=optim_args.weight_dtype,
                 group_size=optim_args.group_size,
                 scheme=optim_args.weight_only_scheme,
                 algorithm=optim_args.quantization_methodology,
+                algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
             )
         else:
             quantization_config = PostTrainingQuantConfig(
@@ -733,17 +766,20 @@ def compute_metrics(eval_preds):
             quantizer.quantize(
                 quantization_config=quantization_config,
                 save_directory=training_args.output_dir,
-                calibration_dataset=train_dataset
-                if optim_args.quantization_approach in ["static", "weight_only"]
-                else None,
-                batch_size=1
-                if optim_args.quantization_approach == "weight_only"
-                else training_args.per_device_train_batch_size,
+                calibration_dataset=(
+                    train_dataset if optim_args.quantization_approach in ["static", "weight_only"] else None
+                ),
+                batch_size=(
+                    1 if optim_args.quantization_approach == "weight_only" else training_args.per_device_train_batch_size
+                ),
             )
             trainer.model = quantizer._quantized_model

         if optim_args.apply_quantization and optim_args.verify_loading:
-            loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
+            if optim_args.quantization_approach == "weight_only":
+                loaded_model = ITREXAutoModelForCausalLM.from_pretrained(training_args.output_dir)
+            else:
+                loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
             tokens = tokenizer("This is a sample input", return_tensors="pt")
             with torch.no_grad():
                 original_model_outputs = trainer.model(**tokens)
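
For context, the weight-only options added above would be exercised with a run_clm.py invocation along these lines (illustrative only: the model name, dataset, and output directory are placeholders, not part of this commit):

python run_clm.py \
    --model_name_or_path EleutherAI/gpt-neo-125m \
    --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
    --apply_quantization --quantization_approach weight_only \
    --quantization_methodology GPTQ --weight_dtype int4_clip \
    --gptq_block_size 128 --gptq_nsamples 128 --gptq_pad_max_length 2048 \
    --do_train --verify_loading --output_dir ./clm_woq_output

With --quantization_approach weight_only and --verify_loading set, the script now reloads the saved checkpoint through ITREXAutoModelForCausalLM instead of INCModelForCausalLM, as shown in the hunk above.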

examples/neural_compressor/text-generation/run_generation.py (+3 -3)

@@ -368,9 +368,9 @@ def calibration_fn(p_model):

     args.length = adjust_length_to_model(
         args.length,
-        max_sequence_length=model.config.max_position_embeddings
-        if hasattr(model.config, "max_position_embeddings")
-        else 0,
+        max_sequence_length=(
+            model.config.max_position_embeddings if hasattr(model.config, "max_position_embeddings") else 0
+        ),
     )
     logger.info(args)

optimum/intel/neural_compressor/__init__.py (+5 -1)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ..utils.import_utils import is_diffusers_available
+from ..utils.import_utils import is_diffusers_available, is_intel_extension_for_transformers_available
 from .configuration import INCConfig
 from .modeling_base import (
     INCModel,
@@ -32,3 +32,7 @@

 if is_diffusers_available():
     from .modeling_diffusion import INCStableDiffusionPipeline
+
+
+if is_intel_extension_for_transformers_available():
+    from .modeling_base import ITREXAutoModelForCausalLM

optimum/intel/neural_compressor/configuration.py (+2 -2)

@@ -35,7 +35,7 @@ class INCConfig(BaseConfig):

     def __init__(
         self,
-        quantization: Optional[Union[Dict, _BaseQuantizationConfig, "WeightOnlyQuantConfig"]] = None,
+        quantization: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
         pruning: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
         distillation: Optional[Union[Dict, _BaseQuantizationConfig]] = None,
         save_onnx_model: bool = False,
@@ -50,7 +50,7 @@ def __init__(
         self.save_onnx_model = save_onnx_model

     @staticmethod
-    def _create_quantization_config(config):
+    def _create_quantization_config(config: Union[Dict, _BaseQuantizationConfig]):
         # TODO : add activations_dtype and weights_dtype
         if isinstance(config, _BaseQuantizationConfig):
             approach = _quantization_model[config.approach]

optimum/intel/neural_compressor/modeling_base.py (+9 -1)

@@ -43,7 +43,7 @@
 from optimum.intel.generation import BaseModelForCausalLM

 from ...modeling_base import OptimizedModel
-from ..utils.import_utils import _torch_version, is_torch_version
+from ..utils.import_utils import _torch_version, is_intel_extension_for_transformers_available, is_torch_version
 from .configuration import INCConfig
 from .utils import WEIGHTS_NAME

@@ -63,6 +63,14 @@
 """


+if is_intel_extension_for_transformers_available():
+    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM as ITREX_WOQ_MODEL
+
+    class ITREXAutoModelForCausalLM(ITREX_WOQ_MODEL):
+        auto_model_class = AutoModelForCausalLM
+        export_feature = "text-generation"
+
+
 class INCModel(OptimizedModel):
     auto_model_class = AutoModel
     export_feature = "feature-extraction"

optimum/intel/neural_compressor/quantization.py (+26 -4)

@@ -15,6 +15,7 @@
 import copy
 import inspect
 import logging
+import types
 import warnings
 from enum import Enum
 from itertools import chain
@@ -79,6 +80,7 @@

 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model
+    from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

 Config = Union[PostTrainingQuantConfig, WeightOnlyQuantConfig]
@@ -185,6 +187,9 @@ def quantize(
         save_directory = Path(save_directory)
         save_directory.mkdir(parents=True, exist_ok=True)
         save_onnx_model = kwargs.pop("save_onnx_model", False)
+        device = kwargs.pop("device", "cpu")
+        use_cpu = True if device == torch.device("cpu") or device == "cpu" else False
+        use_xpu = True if (isinstance(device, torch.device) and device.type == "xpu") or device == "xpu" else False

         if save_onnx_model and (isinstance(self._original_model, ORTModel) or weight_only):
             save_onnx_model = False
@@ -217,7 +222,10 @@ def quantize(
                     f"For weight-only quantization, `quantization_config` should be an instance of `WeightOnlyQuantConfig`, but got: {type(quantization_config)} instead."
                 )

-            if calibration_dataset is None and ("GPTQ" in algo or "AWQ" in algo):
+            if algo not in ["RTN", "GPTQ"]:
+                raise ValueError("Weight-only quantization is only support RTN and GPTQ algorithm now!")
+
+            if calibration_dataset is None and quantization_config.tokenizer is None and ("GPTQ" in algo):
                 raise ValueError(
                     "Weight-only quantization needs a calibration dataset for both GPTQ and AWQ methodologies."
                 )
@@ -278,10 +286,24 @@ def quantize(
             )

         if not isinstance(quantization_config, PostTrainingQuantConfig):
-            self._quantized_model = convert_to_quantized_model(self._original_model, quantization_config)
+            if use_cpu:
+                # will remove after intel-extension-for-transformers 1.3.3 released
+                quantization_config.device = "cpu"
+                quantization_config.post_init()
+            elif use_xpu:
+                # will remove after intel-extension-for-transformers 1.3.3 released
+                quantization_config.device = "xpu"
+                quantization_config.post_init_xpu()
+            self._quantized_model = convert_to_quantized_model(
+                self._original_model, quantization_config, device=quantization_config.device
+            )
+            # will remove after intel-extension-for-transformers 1.3.3 released
+            if hasattr(quantization_config, "calib_dataloader"):
+                quantization_config.calib_dataloader = None
+            self._quantized_model.quantization_config = quantization_config
+            self._quantized_model.save_pretrained = types.MethodType(save_low_bit, self._quantized_model)
             # Save the quantized model
-            output_path = save_directory.joinpath(file_name or default_name)
-            self._quantized_model.save_pretrained(output_path)
+            self._quantized_model.save_pretrained(save_directory)
         else:
             if isinstance(self._original_model.config, PretrainedConfig):
                 self._original_model.config.backend = quantization_config.backend
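
To make the new save/reload flow concrete, here is a minimal sketch that mirrors the updated unit test further below (the tiny checkpoint and output directory are illustrative; it assumes intel-extension-for-transformers is installed so that ITREXAutoModelForCausalLM is importable):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
from optimum.intel.neural_compressor import INCQuantizer, ITREXAutoModelForCausalLM

model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"  # illustrative tiny checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

quantizer = INCQuantizer.from_pretrained(model, task="text-generation")
# RTN weight-only quantization; quantize() now saves the result via ITREX's save_low_bit
quantizer.quantize(quantization_config=WeightOnlyQuantConfig(weight_dtype="int8"), save_directory="woq_model")

# Weight-only checkpoints are reloaded through the ITREXAutoModelForCausalLM wrapper,
# not INCModelForCausalLM
loaded_model = ITREXAutoModelForCausalLM.from_pretrained("woq_model")
inputs = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    outputs = loaded_model(**inputs)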

setup.py (+1 -1)

@@ -34,7 +34,7 @@
     "rjieba",
     "timm",
     "invisible-watermark>=0.2.0",
-    "cmake>=3.16",
+    # Will remove after intel-extension-for-transformers 1.3.3 released.
     "intel-extension-for-transformers>=1.3",
     "peft",
     "auto-gptq",

tests/neural_compressor/test_optimization.py (+18 -19)

@@ -45,7 +45,7 @@
     set_seed,
 )
 from utils_tests import SEED, INCTestMixin, _generate_dataset
-from optimum.intel.utils.import_utils import is_torch_version
+from optimum.intel.utils.import_utils import is_torch_version, is_intel_extension_for_transformers_available


 from optimum.intel import (
@@ -60,11 +60,13 @@
     INCSeq2SeqTrainer,
     INCStableDiffusionPipeline,
 )
-from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 from optimum.intel.utils.constant import DIFFUSION_WEIGHTS_NAME
 from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification
 from optimum.pipelines import ORT_SUPPORTED_TASKS

+if is_intel_extension_for_transformers_available():
+    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
+    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 set_seed(SEED)
@@ -200,63 +202,60 @@ def test_ipex_static_quantization_with_smoothquant(self, task, model_name, expec
                 load_ipex_model=True,
             )

+    @unittest.skipIf(
+        not is_intel_extension_for_transformers_available(), reason="Intel-extension-for-transformers not available!"
+    )
     def test_weight_only_quantization(self):
         model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
         model = AutoModelForCausalLM.from_pretrained(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-        calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)

         with tempfile.TemporaryDirectory() as tmp_dir:
             quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
+            calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)
             quantization_config = WeightOnlyQuantConfig(weight_dtype="int8")
             q_model = quantizer.quantize(
                 quantization_config=quantization_config,
                 save_directory=tmp_dir,
             )
+            q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
             inp = torch.tensor([calibration_dataset[0]["input_ids"]])
             out = model(inp)[0]
             q_out = q_model(inp)[0]
             self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))

         with tempfile.TemporaryDirectory() as tmp_dir:
             quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
+            calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)
             quantization_config = WeightOnlyQuantConfig(
                 algorithm="GPTQ",
+                algorithm_args={
+                    "percdamp": 0.01,
+                    "act_order": False,
+                    "scheme": "sym",
+                },
                 weight_dtype="int4_clip",
             )
             q_model = quantizer.quantize(
                 quantization_config=quantization_config,
                 calibration_dataset=calibration_dataset,
                 save_directory=tmp_dir,
             )
+            q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
             inp = torch.tensor([calibration_dataset[0]["input_ids"]])
             out = model(inp)[0]
             q_out = q_model(inp)[0]
             self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))

         with tempfile.TemporaryDirectory() as tmp_dir:
             quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
-            quantization_config = WeightOnlyQuantConfig(
-                algorithm="AWQ",
-                weight_dtype="int4_clip",
-            )
-            q_model = quantizer.quantize(
-                quantization_config=quantization_config,
-                calibration_dataset=calibration_dataset,
-                save_directory=tmp_dir,
-            )
-            inp = torch.tensor([calibration_dataset[0]["input_ids"]])
-            out = model(inp)[0]
-            q_out = q_model(inp)[0]
-            self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1)))
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation")
+            calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2)
             q_model = quantizer.quantize(
                 weight_only=True,  # use RTN quantization method and NF4 weight data type is default.
                 save_directory=tmp_dir,
             )
+            q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
             inp = torch.tensor([calibration_dataset[0]["input_ids"]])
             out = model(inp)[0]
             q_out = q_model(inp)[0]

tests/openvino/test_modeling_basic.py (+1)

@@ -7,6 +7,7 @@
 This test is meant to run quickly with tiny test models. More extensive tests are in
 test_modeling.py.
 """
+
 # ruff: noqa

 import gc
