
Commit 4ddc352

Merge branch 'huggingface:main' into qwen

2 parents 8656c26 + 2b0d642

13 files changed: +195 -61

.github/workflows/build_pr_documentation.yml (+1 -1)

@@ -60,7 +60,7 @@ jobs:
           echo ${{ env.COMMIT_SHA }} > ./commit_sha
           echo ${{ env.PR_NUMBER }} > ./pr_number
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: doc-build-artifact
           path: optimum-intel/intel-doc-build/

.github/workflows/test_ipex.yml (+1 -1)

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["4.46.0", "4.46.3"]
+        transformers-version: ["4.47.0", "4.47.1"]
         torch-version: ["2.4.0", "2.5.*"]
 
     runs-on: ubuntu-22.04

Dockerfile.ipex (+1 -1)

@@ -43,7 +43,7 @@ ARG KMP_HW_SUBSET=1T
 ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
 
-FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
+FROM intel/intel-extension-for-pytorch:2.5.10-xpu as xpu
 WORKDIR /usr/src/
 
 RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \

optimum/commands/export/openvino.py (+5)

@@ -364,6 +364,11 @@ def run(self):
                 quantization_config["trust_remote_code"] = self.args.trust_remote_code
                 ov_config = OVConfig(quantization_config=quantization_config)
             else:
+                if self.args.dataset is None:
+                    raise ValueError(
+                        "Dataset is required for full quantization. Please provide it with --dataset argument."
+                    )
+
                 quantization_config = {
                     "weight_format": self.args.quant_mode,
                     "activation_format": self.args.quant_mode,

optimum/exporters/ipex/cache_utils.py (+7 -6)

@@ -34,22 +34,23 @@ class IPEXPagedCache(Cache):
     def __init__(
         self,
         config: PretrainedConfig,
-        batch_size: int,
+        max_batch_size: int,
         max_cache_len: int,
         device,
         dtype=None,
         layer_device_map=None,
         **kwargs,
     ) -> None:
         super().__init__()
-        self.batch_size = batch_size
+        self.max_batch_size = max_batch_size
         # Used in `generate` to keep tally of how many tokens the cache has seen
-        self._seen_tokens = torch.zeros([batch_size], dtype=torch.int32, device=device)
+
+        self._seen_tokens = torch.zeros([max_batch_size], dtype=torch.int32, device=device)
         default_block_size = 16 if device.type == "cpu" else 64
         self.block_size = int(os.environ.get("OI_PAGED_ATTN_BLOCK_SIZE", str(default_block_size)))
-        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * batch_size
+        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * max_batch_size
         self.block_tables = -1 * torch.ones([self.num_blocks], dtype=torch.int32, device=device).reshape(
-            batch_size, -1
+            max_batch_size, -1
         )
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=device)
         self.max_cache_len = max_cache_len

@@ -193,7 +194,7 @@ def get_max_length(self) -> Optional[int]:
 
     def reset(self):
         """Resets the cache values while preserving the objects"""
-        self._seen_tokens = torch.zeros([self.batch_size], dtype=torch.int32, device=self.block_tables.device)
+        self._seen_tokens = torch.zeros([self.max_batch_size], dtype=torch.int32, device=self.block_tables.device)
         self.block_tables.fill_(-1)
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=self.block_tables.device)
         self.max_seq_len = 0
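For context, the renamed `max_batch_size` feeds a ceiling division that sizes the paged-attention block pool: each sequence gets enough fixed-size blocks to cover `max_cache_len` tokens. A standalone sketch of that arithmetic, with made-up values:

# Block-pool sizing used by IPEXPagedCache above, reproduced with plain ints.
max_cache_len = 100    # tokens of KV history reserved per sequence
block_size = 16        # CPU default; the XPU default is 64
max_batch_size = 2

# Ceiling division without math.ceil: 100 // 16 + (100 % 16 != 0) -> 6 + 1 = 7
blocks_per_sequence = max_cache_len // block_size + (max_cache_len % block_size != 0)
num_blocks = blocks_per_sequence * max_batch_size

print(blocks_per_sequence, num_blocks)  # 7 14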

optimum/exporters/ipex/model_patcher.py (+2 -2)

@@ -48,8 +48,8 @@
 
 
 # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
-_TRANSFORMERS_MIN_VERSION = "4.46.0"
-_TRANSFORMERS_MAX_VERSION = "4.46.99"
+_TRANSFORMERS_MIN_VERSION = "4.47.0"
+_TRANSFORMERS_MAX_VERSION = "4.47.99"
 
 _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)
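These two constants define a closed support window that the exporter enforces against the installed transformers version. A minimal sketch of how such a window check can be written, assuming `packaging` is available; the enforcement shown here is illustrative, not this module's actual code:

from packaging import version

import transformers

_TRANSFORMERS_MIN_VERSION = "4.47.0"
_TRANSFORMERS_MAX_VERSION = "4.47.99"

current = version.parse(transformers.__version__)
if not version.parse(_TRANSFORMERS_MIN_VERSION) <= current <= version.parse(_TRANSFORMERS_MAX_VERSION):
    raise ImportError(
        f"transformers {transformers.__version__} is outside the supported window "
        f"[{_TRANSFORMERS_MIN_VERSION}, {_TRANSFORMERS_MAX_VERSION}] for the IPEX exporter."
    )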

optimum/exporters/openvino/model_configs.py (+6 -1)

@@ -87,6 +87,7 @@
     InputEmbeddingPatcher,
     InternLM2Patcher,
     InternLMModelPatcher,
+    InternVL2ChatLangModelPatcher,
     InternVLChatImageEmbeddingModelPatcher,
     JaisModelPatcher,
     LlamaModelPatcher,

@@ -1642,7 +1643,11 @@ def with_behavior(
         if behavior == InternVLChatConfigBehavior.LANGUAGE:
             model_type = self._orig_config.llm_config.model_type
             return get_vlm_text_generation_config(
-                model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype
+                model_type,
+                self._orig_config.llm_config,
+                self.int_dtype,
+                self.float_dtype,
+                InternVL2ChatLangModelPatcher,
             )
 
         if behavior == InternVLChatConfigBehavior.VISION_EMBEDDINGS:

optimum/exporters/openvino/model_patcher.py (+81)

@@ -21,6 +21,7 @@
 
 import torch
 import torch.nn.functional as F
+from transformers import PreTrainedModel, TFPreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling
 from transformers.utils import is_tf_available
 

@@ -2992,11 +2993,91 @@
         model.__orig_forward = model.forward
         model.forward = model.extract_feature
 
+        if model.vision_model.encoder.layers[0].attn.use_flash_attn:
+            for layer in model.vision_model.encoder.layers:
+                layer.attn._orig_use_flash_attn = layer.attn.use_flash_attn
+                layer.attn.use_flash_attn = False
+
         super().__init__(config, model, model_kwargs)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
+        if hasattr(self._model.vision_model.encoder.layers[0].attn, "_orig_use_flash_attn"):
+            for layer in self._model.vision_model.encoder.layers:
+                layer.attn.use_flash_attn = layer.attn._orig_use_flash_attn
+
+
+class InternVL2ChatLangModelPatcher(DecoderModelPatcher):
+    def __init__(
+        self, config: "OnnxConfig", model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Dict[str, Any]
+    ):
+        model_type = model.config.model_type
+        patcher_for_model_type = {
+            "llama": LlamaModelPatcher,
+            "qwen2": UpdateCausalMaskModelPatcher,
+            "phi3": Phi3ModelPatcher,
+            "internlm2": InternLM2Patcher,
+        }
+        self._internal_patcher = None
+        self._patched_forward = None
+        internal_patcher_cls = patcher_for_model_type.get(model_type)
+        if internal_patcher_cls is not None:
+            self._internal_patcher = internal_patcher_cls(config, model, model_kwargs)
+            self._patched_forward = self._internal_patcher.patched_forward
+        super().__init__(config, model, model_kwargs)
+
+    @property
+    def patched_forward(self):
+        if self._internal_patcher is not None:
+            return self._internal_patcher.patched_forward
+        return self._patched_forward
+
+    @patched_forward.setter
+    def patched_forward(self, fn):
+        self._patched_forward = fn
+        if self._internal_patcher is not None:
+            self._internal_patcher.patched_forward = fn
+
+    def __enter__(self):
+        if is_torch_version(">=", "2.1.0"):
+            if self._model.config.model_type == "qwen2" and self._model.config._attn_implementation != "sdpa":
+                from transformers.models.qwen2.modeling_qwen2 import QWEN2_ATTENTION_CLASSES
+
+                sdpa_attn = QWEN2_ATTENTION_CLASSES["sdpa"]
+                self._model.config._orig_attn_implementation = self._model.config._attn_implementation
+                self._model.config._attn_implementation = "sdpa"
+
+                for layer in self._model.model.layers:
+                    layer.self_attn._orig_forward = layer.self_attn.forward
+                    layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+            if self._model.config.model_type == "llama" and self._model.config._attn_implementation != "sdpa":
+                self._model.config._orig_attn_implementation = self._model.config._attn_implementation
+                self._model.config._attn_implementation = "sdpa"
+                if is_transformers_version("<", "4.47"):
+                    from transformers.models.llama.modeling_llama import LLAMA_ATTENTION_CLASSES
+
+                    sdpa_attn = LLAMA_ATTENTION_CLASSES["sdpa"]
+                    for layer in self._model.model.layers:
+                        layer.self_attn._orig_forward = layer.self_attn.forward
+                        layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+        if self._internal_patcher is not None:
+            return self._internal_patcher.__enter__()
+        return super().__enter__()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self._internal_patcher:
+            self._internal_patcher.__exit__(exc_type, exc_value, traceback)
+        else:
+            super().__exit__(exc_type, exc_value, traceback)
+
+        if hasattr(self._model.config, "_orig_attn_implementation"):
+            self._model.config._attn_implementation = self._model.config._orig_attn_implementation
+        for layer in self._model.model.layers:
+            if hasattr(layer.self_attn, "_orig_forward"):
+                layer.self_attn.forward = layer.self_attn._orig_forward
 
 
 def llava_vision_embed_forward(self, pixel_values):
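Both patchers above follow the same discipline: stash the original attribute under an `_orig_*` name on entry, swap in the replacement, and restore from the stash on exit so the model is usable afterwards. A toy generic version of that pattern (not optimum-intel code):

class AttributePatch:
    """Context manager that temporarily replaces obj.<name> and restores it on exit."""

    def __init__(self, obj, name, new_value):
        self.obj, self.name, self.new_value = obj, name, new_value

    def __enter__(self):
        # Stash the original under a private name, mirroring `_orig_forward` above.
        setattr(self.obj, f"_orig_{self.name}", getattr(self.obj, self.name))
        setattr(self.obj, self.name, self.new_value)
        return self.obj

    def __exit__(self, exc_type, exc_value, traceback):
        setattr(self.obj, self.name, getattr(self.obj, f"_orig_{self.name}"))
        delattr(self.obj, f"_orig_{self.name}")

InternVL2ChatLangModelPatcher adds one twist on top of this: when the wrapped language model is a known type (llama, qwen2, phi3, internlm2), it constructs the matching dedicated patcher and delegates `__enter__`, `__exit__`, and `patched_forward` to it, so the composite behaves exactly like the specialized patcher would.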

optimum/intel/openvino/configuration.py (+18 -5)

@@ -26,7 +26,12 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
-from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
+from .utils import (
+    LANGUAGE_DATASETS,
+    PREDEFINED_SD_DATASETS,
+    PREDEFINED_SPEECH_TO_TEXT_DATASETS,
+    PREDEFINED_VISUAL_LM_DATASETS,
+)
 
 
 if is_nncf_available():

@@ -467,13 +472,12 @@ def post_init(self):
                 f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
-            lm_datasets = ["wikitext2", "c4", "c4-new", "auto"]
             visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
             stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
-            if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:
+            if self.dataset not in LANGUAGE_DATASETS + visual_lm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    {lm_datasets} for LLMs, {visual_lm_datasets} for visual LLMs
+                    {LANGUAGE_DATASETS} for LLMs, {visual_lm_datasets} for visual LLMs
                     or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )

@@ -617,7 +621,8 @@ def __init__(
         overflow_fix (`str`, default to "disable"):
             Parameter for controlling overflow fix setting.
         dataset (`str`, *optional*):
-            The dataset used for quantization. For text-to-speech model quantization the allowed value is 'librispeech'.
+            The dataset used for quantization. For language models the allowed values are
+            ['auto', 'wikitext2', 'c4', 'c4-new']. For text-to-speech model quantization the allowed value is 'librispeech'.
         tokenizer (`str`, *optional*):
             The tokenizer used to process the dataset. You can pass either:
             - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.

@@ -673,6 +678,14 @@ def post_init(self):
         """
         super().post_init()
 
+        if self.dataset is not None:
+            speech_to_text_datasets = list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())
+            if self.dataset not in LANGUAGE_DATASETS + speech_to_text_datasets:
+                raise ValueError(
+                    f"""You can only choose between the following datasets: {LANGUAGE_DATASETS} for LLMs or
+                    {speech_to_text_datasets} for speech-to-text models, but we found {self.dataset}."""
+                )
+
         if self.bits != 8:
             raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")
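The net effect is that both weight-only and static quantization now validate against the shared `LANGUAGE_DATASETS` constant instead of a locally duplicated list. A reduced sketch of the new static-quantization check; the 'librispeech' key is assumed from the docstring above, while the real code reads the keys from `PREDEFINED_SPEECH_TO_TEXT_DATASETS`:

LANGUAGE_DATASETS = ["wikitext2", "c4", "c4-new", "auto"]
speech_to_text_datasets = ["librispeech"]  # assumed key; taken from PREDEFINED_SPEECH_TO_TEXT_DATASETS in practice

def validate_dataset(dataset: str) -> None:
    # Membership check mirroring OVQuantizationConfig.post_init above.
    if dataset not in LANGUAGE_DATASETS + speech_to_text_datasets:
        raise ValueError(
            f"You can only choose between {LANGUAGE_DATASETS} for LLMs or "
            f"{speech_to_text_datasets} for speech-to-text models, but we found {dataset}."
        )

validate_dataset("wikitext2")   # passes silently
# validate_dataset("imagenet")  # would raise ValueError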

optimum/intel/openvino/utils.py (+2)

@@ -136,6 +136,8 @@
 }
 
 
+LANGUAGE_DATASETS = ["wikitext2", "c4", "c4-new", "auto"]
+
 PREDEFINED_SD_DATASETS = {
     "conceptual_captions": {"split": "train", "inputs": {"prompt": "caption"}},
     "laion/220k-GPT4Vision-captions-from-LIVIS": {"split": "train", "inputs": {"prompt": "caption"}},

setup.py (+1 -1)

@@ -66,7 +66,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47", "accelerate"],
+    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.46,<4.48", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,

tests/openvino/test_exporters_cli.py (+13 -11)

@@ -365,19 +365,21 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
             self.assertEqual(expected_int8[i], num_weight_nodes["int8"])
 
     @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
-    def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int):
+    def test_exporters_cli_hybrid_quantization(
+        self, model_type: str, expected_fake_nodes: int, expected_int8_nodes: int
+    ):
         with TemporaryDirectory() as tmpdir:
             subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --dataset laion/filtered-wit --weight-format int8 {tmpdir}",
                 shell=True,
                 check=True,
             )
             model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir)
-            num_fq, num_weight_nodes = get_num_quantized_nodes(
+            num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(
                 model.unet if model.unet is not None else model.transformer
             )
-            self.assertEqual(exp_num_int8, num_weight_nodes["int8"])
-            self.assertEqual(exp_num_fq, num_fq)
+            self.assertEqual(expected_int8_nodes, num_weight_nodes["int8"])
+            self.assertEqual(expected_fake_nodes, num_fake_nodes)
 
     @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
     def test_exporters_cli_4bit(

@@ -422,8 +424,8 @@ def test_exporters_cli_full_quantization(
         model_type: str,
         quant_mode: str,
         option: str,
-        expected_num_f_nodes_per_model: Tuple[int],
-        expected_num_weight_nodes_per_model: Tuple[int],
+        expected_fake_nodes: Tuple[int],
+        expected_low_precision_nodes: Tuple[int],
     ):
         with TemporaryDirectory() as tmpdir:
             subprocess.run(

@@ -439,12 +441,12 @@ def test_exporters_cli_full_quantization(
             if model.decoder_with_past is not None:
                 models.append(model.decoder_with_past)
             else:
-                expected_num_f_nodes_per_model = expected_num_f_nodes_per_model[:-1]
-            self.assertEqual(len(expected_num_f_nodes_per_model), len(models))
+                expected_fake_nodes = expected_fake_nodes[:-1]
+            self.assertEqual(len(expected_fake_nodes), len(models))
             for i, model in enumerate(models):
-                actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
-                self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes)
-                self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])
+                num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(model)
+                self.assertEqual(expected_fake_nodes[i], num_fake_nodes)
+                self.assertEqual(expected_low_precision_nodes[i], num_weight_nodes[quant_mode])
 
     def test_exporters_cli_int4_with_local_model_and_default_config(self):
         with TemporaryDirectory() as tmpdir:
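The renames above are purely for readability: `parameterized.expand` binds each tuple element positionally, so clearer parameter names document what each expected count means. A toy illustration of that mechanism, with invented tuple values:

import unittest

from parameterized import parameterized


class NamingExample(unittest.TestCase):
    @parameterized.expand([("stable-diffusion", 42, 7)])
    def test_counts(self, model_type, expected_fake_nodes, expected_int8_nodes):
        # Each tuple element binds to the positional parameter of the same index.
        self.assertIsInstance(model_type, str)
        self.assertGreaterEqual(expected_fake_nodes, expected_int8_nodes)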
