Commit 9a7e931

Merge branch 'main' into quant
2 parents 87656ca + a76be08 commit 9a7e931

18 files changed: +161 −85 lines changed

.github/workflows/dockerfile_sanity.yml

+4 −4

@@ -5,13 +5,13 @@ on:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
-
+      - 'Dockerfile.ipex'
+
 
   pull_request:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
+      - 'Dockerfile.ipex'
 
 jobs:
   build_and_run:
@@ -27,7 +27,7 @@ jobs:
       - name: Build and Run Docker Image
         run: |
           IMAGE_NAME="intel_image:latest"
-          docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+          docker build -f Dockerfile.ipex -t $IMAGE_NAME .
           if [ $? -ne 0 ]; then
             echo "Docker image build failed."
             exit 1

.github/workflows/test_openvino.yml

+3 −2

@@ -1,6 +1,7 @@
 name: OpenVINO - Test
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - main
@@ -46,9 +47,9 @@ jobs:
           pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
+        name: Install specific dependencies and versions required for older transformers
         run: |
-          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
       - if: ${{ matrix.test-pattern == '*modeling*' }}
         name: Uninstall NNCF

.github/workflows/test_openvino_slow.yml

+2 −2

@@ -46,8 +46,8 @@ jobs:
           pip uninstall -y nncf
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
-        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+        name: Install specific dependencies and versions required for older transformers
+        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
       - name: Pip freeze
         run: pip freeze

Dockerfile.ipex

+73 (new file)

@@ -0,0 +1,73 @@
+ARG PLATFORM=cpu
+
+FROM ubuntu:22.04 as cpu
+WORKDIR /usr/src/
+RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+        ca-certificates \
+        git \
+        curl \
+        vim \
+        build-essential \
+        ccache \
+        libgoogle-perftools-dev \
+        numactl \
+        cmake \
+        libjpeg-dev \
+        pybind11-dev \
+        libpng-dev \
+        python3 \
+        python3-pip \
+        && rm -rf /var/lib/apt/lists/*"
+RUN /usr/sbin/update-ccache-symlinks
+RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
+
+ARG IPEX_VERSION=2.5.0
+ARG PYTORCH_VERSION=2.5.1
+ARG TORCHVISION_VERSION=0.20.1+cpu
+ARG TORCHAUDIO_VERSION=2.5.1+cpu
+
+RUN python3 -m pip install --no-cache-dir \
+    torch==${PYTORCH_VERSION}+cpu \
+    torchvision==${TORCHVISION_VERSION} \
+    torchaudio==${TORCHAUDIO_VERSION} \
+    --index-url https://download.pytorch.org/whl/cpu && \
+    python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \
+    python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
+    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \
+    python3 -m pip install --no-cache-dir py-libnuma
+
+ARG KMP_BLOCKTIME=1
+ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
+ARG KMP_HW_SUBSET=1T
+ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
+
+FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
+WORKDIR /usr/src/
+
+RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+        ca-certificates \
+        git \
+        curl \
+        vim \
+        ccache \
+        libgoogle-perftools-dev \
+        numactl \
+        libjpeg-dev \
+        pybind11-dev \
+        libpng-dev \
+        && rm -rf /var/lib/apt/lists/*"
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils
+
+FROM ${PLATFORM}
+
+COPY optimum optimum
+COPY Makefile setup.cfg setup.py pyproject.toml README.md ./
+RUN pip install .

README.md

+2 −2

@@ -6,7 +6,7 @@
 
 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.
 
-[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion.
+[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations like faster attention and operators fusion.
 
 Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target.
 
@@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 
 
 ## IPEX
-To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model.
+To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint, and apply IPEX operators optimization (replaced with customized IPEX operators).
 ```diff
 from transformers import AutoTokenizer, pipeline
 - from transformers import AutoModelForCausalLM

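For context, a minimal end-to-end sketch of the usage the updated README snippet describes (not part of the commit; the `gpt2` checkpoint and the prompt are illustrative):

# Illustrative only: load a causal LM through the IPEX class instead of AutoModelForCausalLM.
import torch
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForCausalLM

model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16)  # no export=True anymore
tokenizer = AutoTokenizer.from_pretrained("gpt2")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(pipe("He's a dreadful magician and", max_new_tokens=32)[0]["generated_text"])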
docker/Dockerfile.intel

-53
This file was deleted.

docs/source/openvino/export.mdx

+5 −2

@@ -78,7 +78,8 @@ Optional arguments:
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
                         quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
                         quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-                        and inference latency. Default value is 1.0.
+                        and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+                        less than 1.0, then data-aware mixed precision assignment will be applied.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
@@ -94,7 +95,9 @@ Optional arguments:
                         can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
                         be collected from model's generations. For diffusion models it should be on of
                         ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-                        visual language models the dataset must be set to 'contextual'.
+                        visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+                        compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+                        argument will not have an effect on the resulting model.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but

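The interaction between `--ratio` and `--dataset` described above can also be exercised from the Python API; a hedged sketch (model name and dataset choice are examples, not part of the commit):

# Sketch only: ratio < 1.0 together with a dataset enables data-aware
# mixed precision assignment during weight-only compression.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(bits=4, ratio=0.8, dataset="wikitext2")
model = OVModelForCausalLM.from_pretrained(
    "gpt2",                     # example checkpoint
    export=True,                # convert to OpenVINO IR on the fly
    quantization_config=quantization_config,
)
model.save_pretrained("gpt2-ov-int4")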
notebooks/ipex/text_generation.ipynb

+2 −2

@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model."
+    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It could apply IPEX, providing optimizations like faster attention and operators fusion."
    ]
   },
   {
@@ -60,7 +60,7 @@
    }
   ],
   "source": [
-    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
+    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
    "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
    "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",

optimum/commands/export/openvino.py

+5 −2

@@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be on of ['conceptual_captions',"
            "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(

optimum/exporters/ipex/modeling_utils.py

+3 −3

@@ -207,7 +207,7 @@ def _llama_model_forward(
         position_ids = torch.arange(
             past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
         )
-        position_ids = position_ids.unsqueeze(0)
+        position_ids = position_ids.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0)
 
     if inputs_embeds is None:
         inputs_embeds = self.embed_tokens(input_ids)
@@ -324,7 +324,7 @@ def _falcon_model_forward(
         )
 
     if position_ids is None:
-        position_ids = cache_position.unsqueeze(0)
+        position_ids = cache_position.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0)
 
     # Prepare head mask if needed
     # 1.0 in head_mask indicate we keep the head
@@ -446,7 +446,7 @@ def _gpt2_model_forward(
     past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
     if position_ids is None:
         position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-        position_ids = position_ids.unsqueeze(0)
+        position_ids = position_ids.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0)
 
     if inputs_embeds is None:
         inputs_embeds = self.wte(input_ids)

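The three edits above make the same fix: `position_ids` used to stay at shape `(1, seq_len)` and is now expanded to one row per batch element. A standalone shape check (values are illustrative):

import torch

batch_size, past_len, seq_len = 4, 0, 3
position_ids = torch.arange(past_len, past_len + seq_len, dtype=torch.long)

before = position_ids.unsqueeze(0)                                   # shape (1, 3)
after = position_ids.unsqueeze(0).repeat_interleave(batch_size, 0)   # shape (4, 3)

print(before.shape)  # torch.Size([1, 3])
print(after.shape)   # torch.Size([4, 3])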
optimum/exporters/openvino/__main__.py

−3

@@ -474,9 +474,6 @@ class StoreAttr(object):
                 from optimum.intel.openvino.quantization import _weight_only_quantization
 
                 _weight_only_quantization(submodel, quantization_config)
-                if "text-generation" in task:
-                    submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"])
-
                 compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
                 save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
                 del submodel

optimum/exporters/openvino/model_configs.py

+3 −2

@@ -1804,8 +1804,9 @@ def __init__(
         normalized_config: NormalizedVisionConfig,
         batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
         num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
-        width: int = DEFAULT_DUMMY_SHAPES["width"],
-        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"] // 4,
+        height: int = DEFAULT_DUMMY_SHAPES["height"] // 4,
+        # Reduce img shape by 4 for FLUX to reduce memory usage on conversion
         **kwargs,
     ):
         super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)

optimum/intel/__init__.py

+1

@@ -51,6 +51,7 @@
         "IPEXModel",
     ]
 else:
+    _import_structure["utils.dummy_ipex_objects"] = []
     _import_structure["ipex"] = [
         "IPEXModelForCausalLM",
         "IPEXModelForSeq2SeqLM",

optimum/intel/ipex/modeling_base.py

+4 −2

@@ -62,7 +62,7 @@
 _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation")
 _IPEX_MINIMUM_VERSION_FOR_COMPILE = "2.5.0"
 # TODO: Some models are already fixed in torch 2.6, will enable them when torch upgrading to 2.6
-_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "beit", "llama", "falcon", "gpt2")
+_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "gpt_neox", "beit", "llama", "falcon", "gpt2")
 
 
 def _is_patched_with_ipex(model, task, use_cache: bool = True):
@@ -291,14 +291,16 @@ def forward(
         attention_mask: Optional[torch.FloatTensor] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
+        if self.add_patch and input_ids is not None and attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
         return self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
 
     def _prepare_generation_config(
         self, generation_config: Optional[GenerationConfig], **kwargs: Dict
     ) -> Tuple[GenerationConfig, Dict]:
         generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs)
         generation_method = generation_config.get_generation_mode().value
-        if self.compiled and generation_config.cache_implementation != "ipex_paged":
+        if self.compiled and generation_config.cache_implementation != "ipex_paged" and self._supports_static_cache:
             # Use static cache for torch compile
             generation_config.cache_implementation = "static"
         if generation_method not in _IPEX_EXPORTED_GENERATION_METHODS:

optimum/intel/openvino/configuration.py

+15 −1

@@ -344,6 +344,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
+            Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment
+            will be applied.
         all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`str`, *optional*):
@@ -441,7 +443,7 @@ def post_init(self):
         Safety checker that arguments are correct
         """
         super().post_init()
-        if self.ratio is not None and not (0 <= self.ratio <= 1):
+        if not (0 <= self.ratio <= 1):
             raise ValueError("`ratio` must between 0 and 1.")
         if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
             raise ValueError("`group_size` must be greater than 0 or equal to -1")
@@ -461,6 +463,18 @@ def post_init(self):
                 or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
             )
 
+        if self.dataset is not None and not (
+            self.quant_method == OVQuantizationMethod.AWQ
+            or self.scale_estimation
+            or self.gptq
+            or self.lora_correction
+            or (self.ratio < 1.0 and self.sensitivity_metric != nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR)
+        ):
+            logger.warning(
+                "The provided dataset won't have any effect on the resulting compressed model because no data-aware "
+                "quantization algorithm is selected and compression ratio is 1.0."
+            )
+
         if self.bits not in [4, 8]:
             raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")

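A hedged illustration of when the new `post_init` warning fires (dataset name is an example; this assumes the check runs when the config is constructed, as the rest of this config's validation does):

from optimum.intel import OVWeightQuantizationConfig

# Dataset given, but no AWQ / scale estimation / GPTQ / LoRA correction and ratio
# left at the default 1.0: the dataset has no effect, so the new warning is logged.
config_warns = OVWeightQuantizationConfig(bits=4, dataset="wikitext2")

# ratio < 1.0 makes the mixed-precision assignment data-aware, so no warning.
config_ok = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", ratio=0.8)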
setup.py

+1 −1

@@ -66,7 +66,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47"],
+    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
