
Commit 4011612

Merge remote-tracking branch 'upstream/main' into penghuic/weight_only_with_itrex
2 parents f5363e7 + b7703dc

File tree: 2 files changed (+66 -3 lines)

  optimum/intel/openvino/modeling_decoder.py  (+43 -3)
  tests/openvino/test_modeling.py  (+23)

optimum/intel/openvino/modeling_decoder.py (+43 -3)
@@ -227,6 +227,34 @@ def _from_transformers(
         if use_cache:
             task = task + "-with-past"
 
+        # Patch the modules to export GPTQ models without a GPU
+        do_gptq_patching = False
+        config_dict = config.to_dict()
+        quantization_config = config_dict.get("quantization_config", None)
+        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
+        if do_gptq_patching:
+            torch.set_default_dtype(torch.float32)
+            orig_cuda_check = torch.cuda.is_available
+            torch.cuda.is_available = lambda: True
+
+            from optimum.gptq import GPTQQuantizer
+
+            orig_post_init_model = GPTQQuantizer.post_init_model
+
+            def post_init_model(self, model):
+                from auto_gptq import exllama_set_max_input_length
+
+                class StoreAttr(object):
+                    pass
+
+                model.quantize_config = StoreAttr()
+                model.quantize_config.desc_act = self.desc_act
+                if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
+                    model = exllama_set_max_input_length(model, self.max_input_length)
+                return model
+
+            GPTQQuantizer.post_init_model = post_init_model
+
         main_export(
             model_name_or_path=model_id,
             output=save_dir_path,
@@ -238,10 +266,14 @@ def _from_transformers(
             local_files_only=local_files_only,
             force_download=force_download,
             trust_remote_code=trust_remote_code,
-            model_kwargs=kwargs,
             int8=load_in_8bit,
         )
 
+        # Unpatch modules after GPTQ export
+        if do_gptq_patching:
+            torch.cuda.is_available = orig_cuda_check
+            GPTQQuantizer.post_init_model = orig_post_init_model
+
         config.is_decoder = True
         config.is_encoder_decoder = False
         config.save_pretrained(save_dir_path)
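Aside (not part of the diff): the export path above uses a temporary monkey-patch-and-restore pattern around the export call. A stripped-down sketch of that pattern, with the export step left as a placeholder, might look like this:

    import torch

    # Pretend a CUDA device is available only for the duration of the export,
    # then restore the original check (a try/finally keeps the restore reliable).
    orig_cuda_check = torch.cuda.is_available
    torch.cuda.is_available = lambda: True
    try:
        ...  # the export (e.g. main_export) would run here
    finally:
        torch.cuda.is_available = orig_cuda_check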
@@ -320,7 +352,10 @@ def forward(
             input_ids = input_ids[:, -1:]
 
         inputs = {}
+        past_len = 0
         if past_key_values is not None:
+            seq_len_dim = 1 if self.model.input(self.key_value_input_names[0]).get_partial_shape()[1].is_dynamic else 2
+            past_len = past_key_values[0][0].shape[seq_len_dim]
             if self._pkv_precision == Type.bf16:
                 # numpy does not support bf16, pretending f16, should change to bf16
                 past_key_values = tuple(
@@ -355,8 +390,13 @@ def forward(
         inputs["input_ids"] = np.array(input_ids)
 
         # Add the attention_mask inputs when needed
-        if "attention_mask" in self.input_names and attention_mask is not None:
-            inputs["attention_mask"] = np.array(attention_mask)
+        if "attention_mask" in self.input_names:
+            if attention_mask is not None:
+                inputs["attention_mask"] = np.array(attention_mask)
+            else:
+                inputs["attention_mask"] = np.ones(
+                    (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype
+                )
 
         # Run inference
         self.request.start_async(inputs, shared_memory=True)
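Aside (not part of the diff): the fallback mask built in forward() can be illustrated on its own with hypothetical values; the expression matches the one added above.

    import numpy as np

    # Hypothetical example: a batch of 1 with 3 new tokens and 4 tokens already in the KV cache.
    input_ids = np.array([[15496, 11, 995]])
    past_len = 4

    # Same construction as in forward(): ones covering both past and current tokens.
    attention_mask = np.ones((input_ids.shape[0], input_ids.shape[1] + past_len), dtype=input_ids.dtype)
    assert attention_mask.shape == (1, 7)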

tests/openvino/test_modeling.py (+23)
@@ -573,6 +573,29 @@ def test_auto_device_loading(self):
         del model
         gc.collect()
 
+    def test_default_filling_attention_mask(self):
+        model_id = MODEL_NAMES["gpt2"]
+        model_with_cache = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.pad_token = tokenizer.eos_token
+        texts = ["this is a simple input"]
+        tokens = tokenizer(texts, return_tensors="pt")
+        self.assertTrue("attention_mask" in model_with_cache.input_names)
+        outs = model_with_cache(**tokens)
+        attention_mask = tokens.pop("attention_mask")
+        outs_without_attn_mask = model_with_cache(**tokens)
+        self.assertTrue(torch.allclose(outs.logits, outs_without_attn_mask.logits))
+        input_ids = torch.argmax(outs.logits, dim=2)
+        past_key_values = outs.past_key_values
+        attention_mask = torch.ones((input_ids.shape[0], tokens.input_ids.shape[1] + 1), dtype=torch.long)
+        outs_step2 = model_with_cache(
+            input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values
+        )
+        outs_without_attn_mask_step2 = model_with_cache(input_ids=input_ids, past_key_values=past_key_values)
+        self.assertTrue(torch.allclose(outs_step2.logits, outs_without_attn_mask_step2.logits))
+        del model_with_cache
+        gc.collect()
+
 
 class OVModelForMaskedLMIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = (
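Aside (not part of the diff): a minimal usage sketch of the behavior the new test exercises, assuming a small causal-LM checkpoint such as gpt2; after this change the model can be called without passing attention_mask.

    from transformers import AutoTokenizer
    from optimum.intel import OVModelForCausalLM

    model_id = "gpt2"  # example checkpoint, as in the test above
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokens = tokenizer("this is a simple input", return_tensors="pt")

    # attention_mask can be omitted; forward() fills it with ones of shape
    # (batch_size, input_length + past_length).
    outputs = model(input_ids=tokens["input_ids"])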
