Commit 9416e9c

Merge from main branch
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
2 parents 1d16103 + 80e9bf6

14 files changed (+148, -59 lines)

.github/workflows/test_openvino_basic.yml  (+3, -2)

@@ -41,7 +41,7 @@ jobs:
           # optimum or transformers to a specific version
           # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[tests] openvino onnxruntime ${{ matrix.optimum}}
+          pip install .[tests] openvino ${{ matrix.optimum}}

       - name: Pip freeze
         run: pip freeze
@@ -52,6 +52,7 @@ jobs:

       - name: Slow tests
         run: |
-          pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0
+          pip install nncf
+          pytest tests/openvino -s -m "run_slow" --durations=0
         env:
           RUN_SLOW: 1

optimum/commands/export/openvino.py  (+1, -2)

@@ -20,7 +20,6 @@
 from typing import TYPE_CHECKING, Optional

 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
-from transformers.utils.quantization_config import QuantizationMethod

 from ...exporters import TasksManager
 from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
@@ -289,7 +288,7 @@ def _get_default_int4_config(model_id_or_path, library_name):
                 "all_layers": None if is_int8 else self.args.all_layers,
                 "dataset": self.args.dataset,
                 "num_samples": self.args.num_samples,
-                "quant_method": QuantizationMethod.AWQ if self.args.awq else None,
+                "quant_method": "awq" if self.args.awq else "default",
                 "sensitivity_metric": self.args.sensitivity_metric,
                 "scale_estimation": self.args.scale_estimation,
             }
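The CLI above now hands plain strings ("awq" / "default") to the quantization machinery instead of the transformers QuantizationMethod enum; OVWeightQuantizationConfig (changed later in this commit) coerces such strings back to OVQuantizationMethod. A minimal sketch of that round trip, with illustrative argument values:

    from optimum.intel.openvino.configuration import OVQuantizationMethod, OVWeightQuantizationConfig

    # Argument values are illustrative, mirroring the kwargs dict built by the command above
    config = OVWeightQuantizationConfig(bits=4, quant_method="awq", num_samples=128)
    assert config.quant_method is OVQuantizationMethod.AWQ  # string coerced in __init__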

optimum/exporters/openvino/model_configs.py  (+32, -17)

@@ -167,24 +167,27 @@ def __init__(
         )
         self.multi_query_group_num = normalized_config.multi_query_group_num
         self.head_dim = normalized_config.kv_channels
+        self.standart_cache_layout = hasattr(normalized_config, "rope_ratio")

     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        past_key_shape = (
-            self.sequence_length,
-            self.batch_size,
-            self.multi_query_group_num,
-            self.head_dim,
-        )
-        past_value_shape = (
-            self.sequence_length,
-            self.batch_size,
-            self.multi_query_group_num,
-            self.head_dim,
-        )
+        if not self.standart_cache_layout:
+            pkv_shape = (
+                self.sequence_length,
+                self.batch_size,
+                self.multi_query_group_num,
+                self.head_dim,
+            )
+        else:
+            pkv_shape = (
+                self.batch_size,
+                self.multi_query_group_num,
+                self.sequence_length,
+                self.head_dim,
+            )
         return [
             (
-                self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype),
-                self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype),
+                self.random_float_tensor(pkv_shape, framework=framework, dtype=float_dtype),
+                self.random_float_tensor(pkv_shape, framework=framework, dtype=float_dtype),
             )
             for _ in range(self.num_layers)
         ]
@@ -229,7 +232,10 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
             and "attention_mask" in dummy_inputs
         ):
             # Obtain the past sequence length from the value instead of the key (Bloom). ChatGLM has seq_len in 0 dim instead of -2
-            past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[0]
+            seq_len_dim = 0 if not hasattr(self._normalized_config, "rope_ratio") else -2
+            past_present_length = (
+                dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[seq_len_dim]
+            )

             dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim(
                 dummy_inputs["attention_mask"],
@@ -260,9 +266,18 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire
         decoder_sequence_name = "past_sequence_length + present_lenght"
         name = "present"

+        is_v4 = hasattr(self._normalized_config, "rope_ratio")
         for i in range(self._normalized_config.num_layers):
-            inputs_or_outputs[f"{name}.{i}.key"] = {1: "batch_size", 0: decoder_sequence_name}
-            inputs_or_outputs[f"{name}.{i}.value"] = {1: "batch_size", 0: decoder_sequence_name}
+            inputs_or_outputs[f"{name}.{i}.key"] = (
+                {1: "batch_size", 0: decoder_sequence_name}
+                if not is_v4
+                else {0: "batch_size", 2: decoder_sequence_name}
+            )
+            inputs_or_outputs[f"{name}.{i}.value"] = (
+                {1: "batch_size", 0: decoder_sequence_name}
+                if not is_v4
+                else {0: "batch_size", 2: decoder_sequence_name}
+            )

     def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
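The generator above encodes two past_key_values layouts: the legacy ChatGLM2/3 layout with the sequence dimension first, and the standard layout used by GLM-4 configs (detected via the rope_ratio attribute) with batch first. A small sketch with made-up sizes showing the two shapes and the matching dynamic-axis maps from add_past_key_values:

    import torch

    batch, seq_len, kv_heads, head_dim = 2, 8, 4, 16  # illustrative sizes only

    legacy_key = torch.randn(seq_len, batch, kv_heads, head_dim)  # ChatGLM2/3 cache layout
    glm4_key = torch.randn(batch, kv_heads, seq_len, head_dim)    # GLM-4 "standard" layout

    legacy_axes = {1: "batch_size", 0: "past_sequence_length + present_lenght"}
    glm4_axes = {0: "batch_size", 2: "past_sequence_length + present_lenght"}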

optimum/exporters/openvino/model_patcher.py  (+21, -6)

@@ -190,7 +190,7 @@ def _chatglm_transformer_forward(
     if inputs_embeds is None:
         inputs_embeds = self.embedding(input_ids)

-    if self.pre_seq_len is not None:
+    if getattr(self, "pre_seq_len", None) is not None:
         if past_key_values is None:
             past_key_values = self.get_prompt(
                 batch_size=batch_size,
@@ -285,6 +285,17 @@ def _chatglm2_core_attention_forward(self, query_layer, key_layer, value_layer,
     return context_layer


+def _glm4_core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask):
+    attention_mask = ~attention_mask
+    context_layer = torch.nn.functional.scaled_dot_product_attention(
+        query_layer, key_layer, value_layer, attention_mask.to(torch.float32)
+    )
+    context_layer = context_layer.transpose(1, 2).contiguous()
+    new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+    context_layer = context_layer.reshape(*new_context_layer_shape)
+    return context_layer
+
+
 class ChatGLMModelPatcher(DecoderModelPatcher):
     def __init__(
         self,
@@ -293,21 +304,25 @@ def __init__(
         model_kwargs: Dict[str, Any],
     ):
         super().__init__(config, model, model_kwargs)
-
-        self.original_chatglm_transformer_forward = model.transformer.forward
+        self.is_v4 = hasattr(self._model.config, "rope_ratio")

     def __enter__(self):
         super().__enter__()
-        self._model.transformer.forward = types.MethodType(_chatglm_transformer_forward, self._model.transformer)
+
+        if not self.is_v4:
+            self._model.transformer._orig_forward = self._model.transformer.forward
+            self._model.transformer.forward = types.MethodType(_chatglm_transformer_forward, self._model.transformer)
         for block in self._model.transformer.encoder.layers:
             block.self_attention.core_attention._orig_forward = block.self_attention.core_attention.forward
             block.self_attention.core_attention.forward = types.MethodType(
-                _chatglm2_core_attention_forward, block.self_attention.core_attention
+                _chatglm2_core_attention_forward if not self.is_v4 else _glm4_core_attention_forward,
+                block.self_attention.core_attention,
             )

     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        self._model.transformer.forward = self.original_chatglm_transformer_forward
+        if hasattr(self._model.transformer, "_orig_forward"):
+            self._model.transformer.forward = self._model.transformer._orig_forward
         for block in self._model.transformer.encoder.layers:
             block.self_attention.core_attention.forward = block.self_attention.core_attention._orig_forward
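ChatGLMModelPatcher swaps forward methods on the live modules with types.MethodType and restores them in __exit__. A standalone sketch of that pattern (toy class, not the actual optimum modules):

    import types

    class Toy:
        def forward(self, x):
            return x

    def patched_forward(self, x):
        # stand-in for _chatglm_transformer_forward / _glm4_core_attention_forward
        return x * 2

    m = Toy()
    m._orig_forward = m.forward                       # keep a handle, as the patcher does
    m.forward = types.MethodType(patched_forward, m)  # bind the replacement to the instance
    assert m.forward(3) == 6
    m.forward = m._orig_forward                       # restore on exit
    assert m.forward(3) == 3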

optimum/exporters/openvino/stateful.py  (+1, -1)

@@ -213,7 +213,7 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model):

     # By default, batch is the 0-th but chatglm uses 1-st dimension as batch
     # TODO: Deduce from a model via ordinal reshape (?) and topology
-    batch_dim = 1 if config.model_type == "chatglm" else 0
+    batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0

     fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
     num_attention_heads = config.num_attention_heads if config.model_type == "bloom" else 1

optimum/intel/openvino/configuration.py  (+5, -4)

@@ -20,7 +20,7 @@

 import torch
 from transformers import PretrainedConfig
-from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod
+from transformers.utils.quantization_config import QuantizationConfigMixin

 from optimum.configuration_utils import BaseConfig

@@ -78,6 +78,7 @@
 class OVQuantizationMethod(str, Enum):
     DEFAULT = "default"
     HYBRID = "hybrid"
+    AWQ = "awq"


 @dataclass
@@ -171,7 +172,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class.
         num_samples (`int`, *optional*):
             The maximum number of samples composing the calibration dataset.
-        quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT):
+        quant_method (`str or OVQuantizationMethod`, defaults of OVQuantizationMethod.DEFAULT):
             Weight compression method to apply. Possible options:
                 - "default": default weight quantization will be applied.
                 - "awq": compressed weights will be computed according to the Activation-Aware-Quantization (AWQ)
@@ -199,7 +200,7 @@ def __init__(
         sensitivity_metric: Optional[str] = None,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
-        quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
+        quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
         scale_estimation: bool = None,
         **kwargs,
     ):
@@ -210,7 +211,7 @@ def __init__(
         self.ratio = ratio
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
-        self.quant_method = quant_method
+        self.quant_method = OVQuantizationMethod(quant_method) if isinstance(quant_method, str) else quant_method
         self.scale_estimation = scale_estimation
         self.post_init()
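Since OVQuantizationMethod is a str-based Enum and __init__ now coerces plain strings, both spellings resolve to the same member. A small check, assuming the module path shown in this diff:

    from optimum.intel.openvino.configuration import OVQuantizationMethod

    assert OVQuantizationMethod("awq") is OVQuantizationMethod.AWQ
    assert OVQuantizationMethod.AWQ == "awq"  # str-Enum members compare equal to their value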

optimum/intel/openvino/modeling_decoder.py  (+13, -7)

@@ -328,9 +328,9 @@ def _reshape(
             shapes[inputs][0] = -1
             input_name = inputs.get_any_name()
             if input_name.startswith("past_key_values"):
-                if (
-                    len(inputs.partial_shape) == 3 and input_name.endswith("value")
-                ) or self.config.model_type == "chatglm":
+                if (len(inputs.partial_shape) == 3 and input_name.endswith("value")) or (
+                    self.config.model_type == "chatglm" and not hasattr(self.config, "rope_ratio")
+                ):
                     shapes[inputs][1] = -1
                 else:
                     shapes[inputs][2] = -1
@@ -421,7 +421,7 @@ def prepare_inputs(
                 model_inputs = self.model.input(input_name)
                 dtype = OV_TO_NP_TYPE[model_inputs.get_element_type().get_type_name()]
                 shape = model_inputs.get_partial_shape()
-                if self.config.model_type == "chatglm":
+                if self.config.model_type == "chatglm" and not hasattr(self.config, "rope_ratio"):
                     shape[0] = 0
                     shape[1] = batch_size
                 else:
@@ -573,7 +573,7 @@ def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_ke
                     tuple(
                         (
                             past_state[indicies]
-                            if not self.config.model_type == "chatglm"
+                            if not (self.config.model_type == "chatglm" and not hasattr(self.config, "rope_ratio"))
                             else past_state[:, indicies, ...]
                         )
                         for past_state in layer_past
@@ -607,7 +607,13 @@ def _deduplicate_inputs(self, model_inputs: Dict):
                     upd_batch_size = indicies.shape[0]
                     if self.config.model_type == "bloom":
                         upd_batch_size *= self.config.num_attention_heads
-                    shape[0 if not self.config.model_type == "chatglm" else 1] = upd_batch_size
+                    shape[
+                        (
+                            0
+                            if not (self.config.model_type == "chatglm" and not hasattr(self.config, "rope_ratio"))
+                            else 1
+                        )
+                    ] = upd_batch_size
                     upd_model_inputs[input_name] = Tensor(dtype, shape)
             upd_model_inputs["input_ids"] = unique_input_ids
             if "beam_idx" in model_inputs:
@@ -675,7 +681,7 @@ def _get_past_length(self, past_key_values=None):
         ):
             return past_key_values[0].shape[-2]
         seq_length_dim = -2
-        if self.config.model_type == "chatglm":
+        if self.config.model_type == "chatglm" and not hasattr(self.config, "rope_ratio"):
             seq_length_dim = 0
         elif self.config.model_type == "qwen":
             seq_length_dim = 1

optimum/intel/openvino/quantization.py  (+1, -2)

@@ -38,7 +38,6 @@
 from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
-from transformers.utils.quantization_config import QuantizationMethod

 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
@@ -828,7 +827,7 @@ def _weight_only_quantization(
         group_size=config.group_size,
         all_layers=config.all_layers,
         sensitivity_metric=sensitivity_metric,
-        awq=config.quant_method == QuantizationMethod.AWQ or None,
+        awq=getattr(config.quant_method, "name", "") == "AWQ" or None,
         ignored_scope=config.get_ignored_scope_instance(),
         dataset=dataset,
         subset_size=config.num_samples if config.num_samples else 128,
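With the transformers import removed, the AWQ switch now keys off the enum member's name, and because False or None evaluates to None, non-AWQ configs pass awq=None (rather than False) to the underlying compression call. A small illustration of that expression, assuming the enum from configuration.py:

    from optimum.intel.openvino.configuration import OVQuantizationMethod

    def awq_flag(quant_method):
        # mirrors: awq=getattr(config.quant_method, "name", "") == "AWQ" or None
        return getattr(quant_method, "name", "") == "AWQ" or None

    assert awq_flag(OVQuantizationMethod.AWQ) is True
    assert awq_flag(OVQuantizationMethod.DEFAULT) is None  # False or None -> None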

pyproject.toml  (+5)

@@ -29,3 +29,8 @@ line-length = 119
 [tool.ruff.isort]
 lines-after-imports = 2
 known-first-party = ["optimum"]
+
+[tool.pytest.ini_options]
+markers = [
+    "run_slow",
+]
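Registering the marker in pyproject.toml keeps pytest from warning about an unknown run_slow mark and lets the workflow above select slow cases with -m "run_slow". A minimal, hypothetical example of a test opting in:

    import pytest

    @pytest.mark.run_slow
    def test_export_large_model():  # hypothetical test name, for illustration only
        ...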

tests/openvino/test_modeling.py  (+14, -5)

@@ -643,6 +643,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "xverse",
         "internlm",
         "jais",
+        "glm4",
     )

     if is_transformers_version(">=", "4.40.0"):
@@ -675,6 +676,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "internlm",
         "codegen2",
         "arctic",
+        "glm4",
     )

     @parameterized.expand(SUPPORTED_ARCHITECTURES)
@@ -716,7 +718,7 @@ def test_compare_to_transformers(self, model_arch):

         set_seed(SEED)
         transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
-        if model_arch in ["qwen", "arctic"]:
+        if model_arch in ["qwen", "arctic", "glm4"]:
             transformers_model.to(torch.float32)

         with torch.no_grad():
@@ -729,7 +731,7 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch == "qwen":
             return

-        if model_arch not in ["chatglm", "persimmon"]:
+        if model_arch not in ["chatglm", "glm4", "persimmon"]:
             tokenizer.pad_token_id = tokenizer.eos_token_id

         if model_arch == "persimmon":
@@ -990,14 +992,21 @@ def test_beam_search(self, model_arch):
         ov_model_stateless.config.eos_token_id = None
         transformers_model.config.eos_token_id = None

-        for idx, gen_config in enumerate(gen_configs):
+        for gen_config in gen_configs:
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]:
                 continue
+
             transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config)
             ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config)
-            self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs), f"generation config : {idx}")
+            self.assertTrue(
+                torch.equal(ov_stateful_outputs, transformers_outputs),
+                f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model_stateful output {ov_stateful_outputs}",
+            )
             ov_stateless_outputs = ov_model_stateless.generate(**tokens, generation_config=gen_config)
-            self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs), f"generation config : {idx}")
+            self.assertTrue(
+                torch.equal(ov_stateless_outputs, transformers_outputs),
+                f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model_stateless output {ov_stateless_outputs}",
+            )


 class OVModelForMaskedLMIntegrationTest(unittest.TestCase):
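The beam-search assertions switch from torch.allclose to torch.equal because generate() returns integer token-id tensors, where an exact element-wise match is the meaningful check; the new failure message also prints the full generation config and both outputs. A tiny illustration:

    import torch

    ov_ids = torch.tensor([[1, 5, 7]])
    hf_ids = torch.tensor([[1, 5, 7]])
    assert torch.equal(ov_ids, hf_ids)  # same shape and exactly the same token ids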
