Commit f970272

Load weight-only quantized model with INCModelForCausalLM
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
1 parent f51266a commit f970272
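
What this buys users, as a minimal usage sketch: after this commit, INCModelForCausalLM.from_pretrained detects a weight-only quantized checkpoint and dispatches to the intel-extension-for-transformers loader internally, so a single entry point serves both quantization approaches. The model directory below is an illustrative placeholder, not taken from the commit.

import torch
from transformers import AutoTokenizer
from optimum.intel.neural_compressor import INCModelForCausalLM

# One entry point for both cases: a regular INC-quantized checkpoint and a
# weight-only (RTN/GPTQ/AWQ/AutoRound) checkpoint produced with
# intel-extension-for-transformers.
model = INCModelForCausalLM.from_pretrained("./quantized_model")  # placeholder path
tokenizer = AutoTokenizer.from_pretrained("./quantized_model")

tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    outputs = model(**tokens)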

File tree

4 files changed (+28 −16 lines)

examples/neural_compressor/language-modeling/run_clm.py

+1 −5
@@ -63,7 +63,6 @@
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 
-    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
@@ -777,10 +776,7 @@ def compute_metrics(eval_preds):
         trainer.model = quantizer._quantized_model
 
     if optim_args.apply_quantization and optim_args.verify_loading:
-        if optim_args.quantization_approach == "weight_only":
-            loaded_model = ITREXAutoModelForCausalLM.from_pretrained(training_args.output_dir)
-        else:
-            loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
+        loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         with torch.no_grad():
             original_model_outputs = trainer.model(**tokens)
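
With the branch removed, the verify_loading path in run_clm.py is the same for every quantization approach; a condensed sketch of that flow follows (trainer, tokenizer, and training_args come from the surrounding script, and the closeness check is an assumed illustration, not quoted from the commit):

loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    original_model_outputs = trainer.model(**tokens)
    loaded_model_outputs = loaded_model(**tokens)
# Assumed verification step: the reloaded model should reproduce the
# in-memory quantized model's logits.
assert torch.allclose(original_model_outputs.logits, loaded_model_outputs.logits, atol=1e-4)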

optimum/intel/neural_compressor/__init__.py

−4
@@ -32,7 +32,3 @@
 
 if is_diffusers_available():
     from .modeling_diffusion import INCStableDiffusionPipeline
-
-
-if is_intel_extension_for_transformers_available():
-    from .modeling_base import ITREXAutoModelForCausalLM
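
The upshot of this file's change is that ITREXAutoModelForCausalLM is no longer part of the package's public surface. A migration sketch for downstream imports:

# Before this commit (removed by this change):
#   from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
# After this commit, a single class covers both quantization approaches:
from optimum.intel.neural_compressor import INCModelForCausalLM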

optimum/intel/neural_compressor/modeling_base.py

+26 −5
@@ -65,11 +65,7 @@
 
 if is_intel_extension_for_transformers_available():
     from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM as ITREX_WOQ_MODEL
-
-    class ITREXAutoModelForCausalLM(ITREX_WOQ_MODEL):
-        auto_model_class = AutoModelForCausalLM
-        export_feature = "text-generation"
-
+    from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig
 
 class INCModel(OptimizedModel):
     auto_model_class = AutoModel
 
@@ -138,6 +134,31 @@ def _from_pretrained(
         model_save_dir = Path(model_cache_path).parent
         inc_config = None
         msg = None
+        try:
+            quantization_config = WeightOnlyQuantConfig.from_pretrained(model_id)
+            if getattr(quantization_config, "algorithm", None) is not None and quantization_config.algorithm.lower() in [
+                "rtn", "gptq", "awq", "autoround"
+            ]:
+                if not is_intel_extension_for_transformers_available():
+                    raise ImportError(
+                        "Couldn't find the intel-extension-for-transformers package. "
+                        "Please install it with: pip install intel-extension-for-transformers peft"
+                    )
+                return ITREX_WOQ_MODEL.from_pretrained(
+                    pretrained_model_name_or_path=model_id,
+                    use_auth_token=use_auth_token,
+                    revision=revision,
+                    force_download=force_download,
+                    cache_dir=cache_dir,
+                    local_files_only=local_files_only,
+                    subfolder=subfolder,
+                    trust_remote_code=trust_remote_code,
+                    **kwargs,
+                )
+        except EnvironmentError:
+            msg = (
+                "The model is not quantized with weight-only quantization."
+            )
         try:
             inc_config = INCConfig.from_pretrained(model_id)
             if not is_torch_version("==", inc_config.torch_version):
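
The detection-and-dispatch pattern added above can be read in isolation; a minimal sketch, assuming (as the except clause implies) that WeightOnlyQuantConfig.from_pretrained raises EnvironmentError when the checkpoint carries no weight-only quantization config. load_woq_model and load_inc_model are hypothetical stand-ins for the two real loading paths; only the control flow mirrors the commit.

from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig

# Algorithms that should be handled by the intel-extension-for-transformers loader.
WOQ_ALGORITHMS = {"rtn", "gptq", "awq", "autoround"}

def load_quantized(model_id, load_woq_model, load_inc_model):
    try:
        config = WeightOnlyQuantConfig.from_pretrained(model_id)
        algorithm = getattr(config, "algorithm", None)
        if algorithm is not None and algorithm.lower() in WOQ_ALGORITHMS:
            # Weight-only checkpoint: hand off to the ITREX loading path.
            return load_woq_model(model_id)
    except EnvironmentError:
        # No weight-only quantization config found for this model_id.
        pass
    # Otherwise fall back to the regular INC loading path.
    return load_inc_model(model_id)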

tests/neural_compressor/test_optimization.py

+1 −2
@@ -65,7 +65,6 @@
 from optimum.pipelines import ORT_SUPPORTED_TASKS
 
 if is_intel_extension_for_transformers_available():
-    from optimum.intel.neural_compressor import ITREXAutoModelForCausalLM
     from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
@@ -244,7 +243,7 @@ def test_weight_only_quantization(self, no_config, algo, weight_dtype):
             weight_only=True,  # use the RTN quantization method; the NF4 weight data type is the default.
             save_directory=tmp_dir,
         )
-        q_model = ITREXAutoModelForCausalLM.from_pretrained(tmp_dir)
+        q_model = INCModelForCausalLM.from_pretrained(tmp_dir)
         inp = torch.tensor([calibration_dataset[0]["input_ids"]])
         out = model(inp)[0]
         q_out = q_model(inp)[0]
