
Commit 4ddc352

Merge branch 'huggingface:main' into qwen

2 parents 8656c26 + 2b0d642

13 files changed: +195 -61

.github/workflows/build_pr_documentation.yml (+1 -1)

@@ -60,7 +60,7 @@ jobs:
           echo ${{ env.COMMIT_SHA }} > ./commit_sha
           echo ${{ env.PR_NUMBER }} > ./pr_number
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: doc-build-artifact
           path: optimum-intel/intel-doc-build/

.github/workflows/test_ipex.yml (+1 -1)

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["4.46.0", "4.46.3"]
+        transformers-version: ["4.47.0", "4.47.1"]
         torch-version: ["2.4.0", "2.5.*"]
 
     runs-on: ubuntu-22.04

Dockerfile.ipex (+1 -1)

@@ -43,7 +43,7 @@ ARG KMP_HW_SUBSET=1T
 ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
 
-FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
+FROM intel/intel-extension-for-pytorch:2.5.10-xpu as xpu
 WORKDIR /usr/src/
 
 RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \

optimum/commands/export/openvino.py (+5)

@@ -364,6 +364,11 @@ def run(self):
                 quantization_config["trust_remote_code"] = self.args.trust_remote_code
                 ov_config = OVConfig(quantization_config=quantization_config)
             else:
+                if self.args.dataset is None:
+                    raise ValueError(
+                        "Dataset is required for full quantization. Please provide it with --dataset argument."
+                    )
+
                 quantization_config = {
                     "weight_format": self.args.quant_mode,
                     "activation_format": self.args.quant_mode,

optimum/exporters/ipex/cache_utils.py (+7 -6)

@@ -34,22 +34,23 @@ class IPEXPagedCache(Cache):
     def __init__(
         self,
         config: PretrainedConfig,
-        batch_size: int,
+        max_batch_size: int,
         max_cache_len: int,
         device,
         dtype=None,
         layer_device_map=None,
         **kwargs,
     ) -> None:
         super().__init__()
-        self.batch_size = batch_size
+        self.max_batch_size = max_batch_size
         # Used in `generate` to keep tally of how many tokens the cache has seen
-        self._seen_tokens = torch.zeros([batch_size], dtype=torch.int32, device=device)
+
+        self._seen_tokens = torch.zeros([max_batch_size], dtype=torch.int32, device=device)
         default_block_size = 16 if device.type == "cpu" else 64
         self.block_size = int(os.environ.get("OI_PAGED_ATTN_BLOCK_SIZE", str(default_block_size)))
-        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * batch_size
+        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * max_batch_size
         self.block_tables = -1 * torch.ones([self.num_blocks], dtype=torch.int32, device=device).reshape(
-            batch_size, -1
+            max_batch_size, -1
         )
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=device)
         self.max_cache_len = max_cache_len

@@ -193,7 +194,7 @@ def get_max_length(self) -> Optional[int]:
 
     def reset(self):
         """Resets the cache values while preserving the objects"""
-        self._seen_tokens = torch.zeros([self.batch_size], dtype=torch.int32, device=self.block_tables.device)
+        self._seen_tokens = torch.zeros([self.max_batch_size], dtype=torch.int32, device=self.block_tables.device)
         self.block_tables.fill_(-1)
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=self.block_tables.device)
         self.max_seq_len = 0
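For context, the renamed `max_batch_size` feeds a ceiling division that sizes the paged-attention block pool: each sequence gets enough fixed-size blocks to cover `max_cache_len` tokens. A standalone sketch of that arithmetic, with made-up values:

# Block-pool sizing used by IPEXPagedCache above, reproduced with plain ints.
max_cache_len = 100    # tokens of KV history reserved per sequence
block_size = 16        # CPU default; the XPU default is 64
max_batch_size = 2

# Ceiling division without math.ceil: 100 // 16 + (100 % 16 != 0) -> 6 + 1 = 7
blocks_per_sequence = max_cache_len // block_size + (max_cache_len % block_size != 0)
num_blocks = blocks_per_sequence * max_batch_size

print(blocks_per_sequence, num_blocks)  # 7 14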

optimum/exporters/ipex/model_patcher.py (+2 -2)

@@ -48,8 +48,8 @@
 
 
 # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
-_TRANSFORMERS_MIN_VERSION = "4.46.0"
-_TRANSFORMERS_MAX_VERSION = "4.46.99"
+_TRANSFORMERS_MIN_VERSION = "4.47.0"
+_TRANSFORMERS_MAX_VERSION = "4.47.99"
 
 _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)
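These two constants define a closed support window that the exporter enforces against the installed transformers version. A minimal sketch of how such a window check can be written, assuming `packaging` is available; the enforcement shown here is illustrative, not this module's actual code:

from packaging import version

import transformers

_TRANSFORMERS_MIN_VERSION = "4.47.0"
_TRANSFORMERS_MAX_VERSION = "4.47.99"

current = version.parse(transformers.__version__)
if not version.parse(_TRANSFORMERS_MIN_VERSION) <= current <= version.parse(_TRANSFORMERS_MAX_VERSION):
    raise ImportError(
        f"transformers {transformers.__version__} is outside the supported window "
        f"[{_TRANSFORMERS_MIN_VERSION}, {_TRANSFORMERS_MAX_VERSION}] for the IPEX exporter."
    )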

optimum/exporters/openvino/model_configs.py (+6 -1)

@@ -87,6 +87,7 @@
     InputEmbeddingPatcher,
     InternLM2Patcher,
     InternLMModelPatcher,
+    InternVL2ChatLangModelPatcher,
     InternVLChatImageEmbeddingModelPatcher,
     JaisModelPatcher,
     LlamaModelPatcher,

@@ -1642,7 +1643,11 @@ def with_behavior(
         if behavior == InternVLChatConfigBehavior.LANGUAGE:
             model_type = self._orig_config.llm_config.model_type
             return get_vlm_text_generation_config(
-                model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype
+                model_type,
+                self._orig_config.llm_config,
+                self.int_dtype,
+                self.float_dtype,
+                InternVL2ChatLangModelPatcher,
             )
 
         if behavior == InternVLChatConfigBehavior.VISION_EMBEDDINGS:

optimum/exporters/openvino/model_patcher.py (+81)

@@ -21,6 +21,7 @@
 
 import torch
 import torch.nn.functional as F
+from transformers import PreTrainedModel, TFPreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling
 from transformers.utils import is_tf_available
 

@@ -2992,11 +2993,91 @@
         model.__orig_forward = model.forward
         model.forward = model.extract_feature
 
+        if model.vision_model.encoder.layers[0].attn.use_flash_attn:
+            for layer in model.vision_model.encoder.layers:
+                layer.attn._orig_use_flash_attn = layer.attn.use_flash_attn
+                layer.attn.use_flash_attn = False
+
         super().__init__(config, model, model_kwargs)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
+        if hasattr(self._model.vision_model.encoder.layers[0].attn, "_orig_use_flash_attn"):
+            for layer in self._model.vision_model.encoder.layers:
+                layer.attn.use_flash_attn = layer.attn._orig_use_flash_attn
+
+
+class InternVL2ChatLangModelPatcher(DecoderModelPatcher):
+    def __init__(
+        self, config: "OnnxConfig", model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Dict[str, Any]
+    ):
+        model_type = model.config.model_type
+        patcher_for_model_type = {
+            "llama": LlamaModelPatcher,
+            "qwen2": UpdateCausalMaskModelPatcher,
+            "phi3": Phi3ModelPatcher,
+            "internlm2": InternLM2Patcher,
+        }
+        self._internal_patcher = None
+        self._patched_forward = None
+        internal_patcher_cls = patcher_for_model_type.get(model_type)
+        if internal_patcher_cls is not None:
+            self._internal_patcher = internal_patcher_cls(config, model, model_kwargs)
+            self._patched_forward = self._internal_patcher.patched_forward
+        super().__init__(config, model, model_kwargs)
+
+    @property
+    def patched_forward(self):
+        if self._internal_patcher is not None:
+            return self._internal_patcher.patched_forward
+        return self._patched_forward
+
+    @patched_forward.setter
+    def patched_forward(self, fn):
+        self._patched_forward = fn
+        if self._internal_patcher is not None:
+            self._internal_patcher.patched_forward = fn
+
+    def __enter__(self):
+        if is_torch_version(">=", "2.1.0"):
+            if self._model.config.model_type == "qwen2" and self._model.config._attn_implementation != "sdpa":
+                from transformers.models.qwen2.modeling_qwen2 import QWEN2_ATTENTION_CLASSES
+
+                sdpa_attn = QWEN2_ATTENTION_CLASSES["sdpa"]
+                self._model.config._orig_attn_implementation = self._model.config._attn_implementation
+                self._model.config._attn_implementation = "sdpa"
+
+                for layer in self._model.model.layers:
+                    layer.self_attn._orig_forward = layer.self_attn.forward
+                    layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+            if self._model.config.model_type == "llama" and self._model.config._attn_implementation != "sdpa":
+                self._model.config._orig_attn_implementation = self._model.config._attn_implementation
+                self._model.config._attn_implementation = "sdpa"
+                if is_transformers_version("<", "4.47"):
+                    from transformers.models.llama.modeling_llama import LLAMA_ATTENTION_CLASSES
+
+                    sdpa_attn = LLAMA_ATTENTION_CLASSES["sdpa"]
+                    for layer in self._model.model.layers:
+                        layer.self_attn._orig_forward = layer.self_attn.forward
+                        layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+        if self._internal_patcher is not None:
+            return self._internal_patcher.__enter__()
+        return super().__enter__()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self._internal_patcher:
+            self._internal_patcher.__exit__(exc_type, exc_value, traceback)
+        else:
+            super().__exit__(exc_type, exc_value, traceback)
+
+        if hasattr(self._model.config, "_orig_attn_implementation"):
+            self._model.config._attn_implementation = self._model.config._orig_attn_implementation
+        for layer in self._model.model.layers:
+            if hasattr(layer.self_attn, "_orig_forward"):
+                layer.self_attn.forward = layer.self_attn._orig_forward
 
 
 def llava_vision_embed_forward(self, pixel_values):
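Both patchers above follow the same discipline: stash the original attribute under an `_orig_*` name on entry, swap in the replacement, and restore from the stash on exit so the model is usable afterwards. A toy generic version of that pattern (not optimum-intel code):

class AttributePatch:
    """Context manager that temporarily replaces obj.<name> and restores it on exit."""

    def __init__(self, obj, name, new_value):
        self.obj, self.name, self.new_value = obj, name, new_value

    def __enter__(self):
        # Stash the original under a private name, mirroring `_orig_forward` above.
        setattr(self.obj, f"_orig_{self.name}", getattr(self.obj, self.name))
        setattr(self.obj, self.name, self.new_value)
        return self.obj

    def __exit__(self, exc_type, exc_value, traceback):
        setattr(self.obj, self.name, getattr(self.obj, f"_orig_{self.name}"))
        delattr(self.obj, f"_orig_{self.name}")

InternVL2ChatLangModelPatcher adds one twist on top of this: when the wrapped language model is a known type (llama, qwen2, phi3, internlm2), it constructs the matching dedicated patcher and delegates `__enter__`, `__exit__`, and `patched_forward` to it, so the composite behaves exactly like the specialized patcher would.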

optimum/intel/openvino/configuration.py (+18 -5)

@@ -26,7 +26,12 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
-from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
+from .utils import (
+    LANGUAGE_DATASETS,
+    PREDEFINED_SD_DATASETS,
+    PREDEFINED_SPEECH_TO_TEXT_DATASETS,
+    PREDEFINED_VISUAL_LM_DATASETS,
+)
 
 
 if is_nncf_available():

@@ -467,13 +472,12 @@ def post_init(self):
                 f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
-            lm_datasets = ["wikitext2", "c4", "c4-new", "auto"]
             visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
             stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
-            if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:
+            if self.dataset not in LANGUAGE_DATASETS + visual_lm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    {lm_datasets} for LLMs, {visual_lm_datasets} for visual LLMs
+                    {LANGUAGE_DATASETS} for LLMs, {visual_lm_datasets} for visual LLMs
                     or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )

@@ -617,7 +621,8 @@ def __init__(
         overflow_fix (`str`, default to "disable"):
             Parameter for controlling overflow fix setting.
         dataset (`str`, *optional*):
-            The dataset used for quantization. For text-to-speech model quantization the allowed value is 'librispeech'.
+            The dataset used for quantization. For language models the allowed values are
+            ['auto', 'wikitext2', 'c4', 'c4-new']. For text-to-speech model quantization the allowed value is 'librispeech'.
         tokenizer (`str`, *optional*):
             The tokenizer used to process the dataset. You can pass either:
             - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.

@@ -673,6 +678,14 @@ def post_init(self):
         """
         super().post_init()
 
+        if self.dataset is not None:
+            speech_to_text_datasets = list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())
+            if self.dataset not in LANGUAGE_DATASETS + speech_to_text_datasets:
+                raise ValueError(
+                    f"""You can only choose between the following datasets: {LANGUAGE_DATASETS} for LLMs or
+                    {speech_to_text_datasets} for speech-to-text models, but we found {self.dataset}."""
+                )
+
         if self.bits != 8:
             raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")
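The net effect is that both weight-only and static quantization now validate against the shared `LANGUAGE_DATASETS` constant instead of a locally duplicated list. A reduced sketch of the new static-quantization check; the 'librispeech' key is assumed from the docstring above, while the real code reads the keys from `PREDEFINED_SPEECH_TO_TEXT_DATASETS`:

LANGUAGE_DATASETS = ["wikitext2", "c4", "c4-new", "auto"]
speech_to_text_datasets = ["librispeech"]  # assumed key; taken from PREDEFINED_SPEECH_TO_TEXT_DATASETS in practice

def validate_dataset(dataset: str) -> None:
    # Membership check mirroring OVQuantizationConfig.post_init above.
    if dataset not in LANGUAGE_DATASETS + speech_to_text_datasets:
        raise ValueError(
            f"You can only choose between {LANGUAGE_DATASETS} for LLMs or "
            f"{speech_to_text_datasets} for speech-to-text models, but we found {dataset}."
        )

validate_dataset("wikitext2")   # passes silently
# validate_dataset("imagenet")  # would raise ValueError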

optimum/intel/openvino/utils.py (+2)

@@ -136,6 +136,8 @@
 }
 
 
+LANGUAGE_DATASETS = ["wikitext2", "c4", "c4-new", "auto"]
+
 PREDEFINED_SD_DATASETS = {
     "conceptual_captions": {"split": "train", "inputs": {"prompt": "caption"}},
     "laion/220k-GPT4Vision-captions-from-LIVIS": {"split": "train", "inputs": {"prompt": "caption"}},

setup.py (+1 -1)

@@ -66,7 +66,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47", "accelerate"],
+    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.46,<4.48", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,

tests/openvino/test_exporters_cli.py (+13 -11)

@@ -365,19 +365,21 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
             self.assertEqual(expected_int8[i], num_weight_nodes["int8"])
 
     @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES)
-    def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int):
+    def test_exporters_cli_hybrid_quantization(
+        self, model_type: str, expected_fake_nodes: int, expected_int8_nodes: int
+    ):
         with TemporaryDirectory() as tmpdir:
             subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --dataset laion/filtered-wit --weight-format int8 {tmpdir}",
                 shell=True,
                 check=True,
             )
             model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir)
-            num_fq, num_weight_nodes = get_num_quantized_nodes(
+            num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(
                 model.unet if model.unet is not None else model.transformer
             )
-            self.assertEqual(exp_num_int8, num_weight_nodes["int8"])
-            self.assertEqual(exp_num_fq, num_fq)
+            self.assertEqual(expected_int8_nodes, num_weight_nodes["int8"])
+            self.assertEqual(expected_fake_nodes, num_fake_nodes)
 
     @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
     def test_exporters_cli_4bit(

@@ -422,8 +424,8 @@ def test_exporters_cli_full_quantization(
         model_type: str,
         quant_mode: str,
         option: str,
-        expected_num_f_nodes_per_model: Tuple[int],
-        expected_num_weight_nodes_per_model: Tuple[int],
+        expected_fake_nodes: Tuple[int],
+        expected_low_precision_nodes: Tuple[int],
     ):
         with TemporaryDirectory() as tmpdir:
             subprocess.run(

@@ -439,12 +441,12 @@ def test_exporters_cli_full_quantization(
             if model.decoder_with_past is not None:
                 models.append(model.decoder_with_past)
             else:
-                expected_num_f_nodes_per_model = expected_num_f_nodes_per_model[:-1]
-            self.assertEqual(len(expected_num_f_nodes_per_model), len(models))
+                expected_fake_nodes = expected_fake_nodes[:-1]
+            self.assertEqual(len(expected_fake_nodes), len(models))
             for i, model in enumerate(models):
-                actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
-                self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes)
-                self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])
+                num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(model)
+                self.assertEqual(expected_fake_nodes[i], num_fake_nodes)
+                self.assertEqual(expected_low_precision_nodes[i], num_weight_nodes[quant_mode])
 
     def test_exporters_cli_int4_with_local_model_and_default_config(self):
         with TemporaryDirectory() as tmpdir:
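The renames above are purely for readability: `parameterized.expand` binds each tuple element positionally, so clearer parameter names document what each expected count means. A toy illustration of that mechanism, with invented tuple values:

import unittest

from parameterized import parameterized


class NamingExample(unittest.TestCase):
    @parameterized.expand([("stable-diffusion", 42, 7)])
    def test_counts(self, model_type, expected_fake_nodes, expected_int8_nodes):
        # Each tuple element binds to the positional parameter of the same index.
        self.assertIsInstance(model_type, str)
        self.assertGreaterEqual(expected_fake_nodes, expected_int8_nodes)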
