Commit 9a7e931

Merge branch 'main' into quant
2 parents 87656ca + a76be08 commit 9a7e931

18 files changed: +161 −85 lines changed

.github/workflows/dockerfile_sanity.yml

+4 −4

@@ -5,13 +5,13 @@ on:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
-
+      - 'Dockerfile.ipex'
+
 
   pull_request:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
+      - 'Dockerfile.ipex'
 
 jobs:
   build_and_run:
@@ -27,7 +27,7 @@ jobs:
       - name: Build and Run Docker Image
         run: |
           IMAGE_NAME="intel_image:latest"
-          docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+          docker build -f Dockerfile.ipex -t $IMAGE_NAME .
           if [ $? -ne 0 ]; then
             echo "Docker image build failed."
             exit 1

.github/workflows/test_openvino.yml

+3 −2

@@ -1,6 +1,7 @@
 name: OpenVINO - Test
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - main
@@ -46,9 +47,9 @@ jobs:
           pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
+        name: Install specific dependencies and versions required for older transformers
         run: |
-          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
       - if: ${{ matrix.test-pattern == '*modeling*' }}
         name: Uninstall NNCF

.github/workflows/test_openvino_slow.yml

+2 −2

@@ -46,8 +46,8 @@ jobs:
           pip uninstall -y nncf
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
-        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+        name: Install specific dependencies and versions required for older transformers
+        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
       - name: Pip freeze
         run: pip freeze

Dockerfile.ipex

+73 (new file)

@@ -0,0 +1,73 @@
+ARG PLATFORM=cpu
+
+FROM ubuntu:22.04 as cpu
+WORKDIR /usr/src/
+RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+        ca-certificates \
+        git \
+        curl \
+        vim \
+        build-essential \
+        ccache \
+        libgoogle-perftools-dev \
+        numactl \
+        cmake \
+        libjpeg-dev \
+        pybind11-dev \
+        libpng-dev \
+        python3 \
+        python3-pip \
+        && rm -rf /var/lib/apt/lists/*"
+RUN /usr/sbin/update-ccache-symlinks
+RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
+
+ARG IPEX_VERSION=2.5.0
+ARG PYTORCH_VERSION=2.5.1
+ARG TORCHVISION_VERSION=0.20.1+cpu
+ARG TORCHAUDIO_VERSION=2.5.1+cpu
+
+RUN python3 -m pip install --no-cache-dir \
+    torch==${PYTORCH_VERSION}+cpu \
+    torchvision==${TORCHVISION_VERSION} \
+    torchaudio==${TORCHAUDIO_VERSION} \
+    --index-url https://download.pytorch.org/whl/cpu && \
+    python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \
+    python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
+    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \
+    python3 -m pip install --no-cache-dir py-libnuma
+
+ARG KMP_BLOCKTIME=1
+ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
+ARG KMP_HW_SUBSET=1T
+ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
+
+FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
+WORKDIR /usr/src/
+
+RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+        ca-certificates \
+        git \
+        curl \
+        vim \
+        ccache \
+        libgoogle-perftools-dev \
+        numactl \
+        libjpeg-dev \
+        pybind11-dev \
+        libpng-dev \
+        && rm -rf /var/lib/apt/lists/*"
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils
+
+FROM ${PLATFORM}
+
+COPY optimum optimum
+COPY Makefile setup.cfg setup.py pyproject.toml README.md ./
+RUN pip install .

README.md

+2 −2

@@ -6,7 +6,7 @@
 
 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.
 
-[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion.
+[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations like faster attention and operators fusion.
 
 Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target.
 
@@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 
 
 ## IPEX
-To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model.
+To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint, and apply IPEX operators optimization (replaced with customized IPEX operators).
 ```diff
 from transformers import AutoTokenizer, pipeline
 - from transformers import AutoModelForCausalLM

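For context, a minimal end-to-end sketch of the usage the updated README snippet describes (not part of the commit; the `gpt2` checkpoint and the prompt are illustrative):

# Illustrative only: load a causal LM through the IPEX class instead of AutoModelForCausalLM.
import torch
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForCausalLM

model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16)  # no export=True anymore
tokenizer = AutoTokenizer.from_pretrained("gpt2")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(pipe("He's a dreadful magician and", max_new_tokens=32)[0]["generated_text"])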
docker/Dockerfile.intel

-53
This file was deleted.

docs/source/openvino/export.mdx

+5 −2

@@ -78,7 +78,8 @@ Optional arguments:
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
                         quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
                         quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-                        and inference latency. Default value is 1.0.
+                        and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+                        less than 1.0, then data-aware mixed precision assignment will be applied.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
@@ -94,7 +95,9 @@ Optional arguments:
                         can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
                         be collected from model's generations. For diffusion models it should be on of
                         ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-                        visual language models the dataset must be set to 'contextual'.
+                        visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+                        compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+                        argument will not have an effect on the resulting model.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but

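The interaction between `--ratio` and `--dataset` described above can also be exercised from the Python API; a hedged sketch (model name and dataset choice are examples, not part of the commit):

# Sketch only: ratio < 1.0 together with a dataset enables data-aware
# mixed precision assignment during weight-only compression.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(bits=4, ratio=0.8, dataset="wikitext2")
model = OVModelForCausalLM.from_pretrained(
    "gpt2",                     # example checkpoint
    export=True,                # convert to OpenVINO IR on the fly
    quantization_config=quantization_config,
)
model.save_pretrained("gpt2-ov-int4")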
notebooks/ipex/text_generation.ipynb

+2 −2

@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model."
+    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It could apply IPEX, providing optimizations like faster attention and operators fusion."
    ]
   },
   {
@@ -60,7 +60,7 @@
    }
   ],
   "source": [
-    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
+    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
    "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
    "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",

optimum/commands/export/openvino.py

+5 −2

@@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be on of ['conceptual_captions',"
            "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(

optimum/exporters/ipex/modeling_utils.py

+3 −3

@@ -207,7 +207,7 @@ def _llama_model_forward(
         position_ids = torch.arange(
             past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
         )
-        position_ids = position_ids.unsqueeze(0)
+        position_ids = position_ids.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0)
 
     if inputs_embeds is None:
         inputs_embeds = self.embed_tokens(input_ids)
@@ -324,7 +324,7 @@ def _falcon_model_forward(
         )
 
     if position_ids is None:
-        position_ids = cache_position.unsqueeze(0)
+        position_ids = cache_position.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0)
 
     # Prepare head mask if needed
     # 1.0 in head_mask indicate we keep the head
@@ -446,7 +446,7 @@ def _gpt2_model_forward(
     past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
     if position_ids is None:
         position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-        position_ids = position_ids.unsqueeze(0)
+        position_ids = position_ids.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0)
 
     if inputs_embeds is None:
         inputs_embeds = self.wte(input_ids)

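The three edits above make the same fix: `position_ids` used to stay at shape `(1, seq_len)` and is now expanded to one row per batch element. A standalone shape check (values are illustrative):

import torch

batch_size, past_len, seq_len = 4, 0, 3
position_ids = torch.arange(past_len, past_len + seq_len, dtype=torch.long)

before = position_ids.unsqueeze(0)                                   # shape (1, 3)
after = position_ids.unsqueeze(0).repeat_interleave(batch_size, 0)   # shape (4, 3)

print(before.shape)  # torch.Size([1, 3])
print(after.shape)   # torch.Size([4, 3])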
optimum/exporters/openvino/__main__.py

−3

@@ -474,9 +474,6 @@ class StoreAttr(object):
                 from optimum.intel.openvino.quantization import _weight_only_quantization
 
                 _weight_only_quantization(submodel, quantization_config)
-                if "text-generation" in task:
-                    submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"])
-
                 compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
                 save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
                 del submodel

optimum/exporters/openvino/model_configs.py

+3 −2

@@ -1804,8 +1804,9 @@ def __init__(
         normalized_config: NormalizedVisionConfig,
         batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
         num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
-        width: int = DEFAULT_DUMMY_SHAPES["width"],
-        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"] // 4,
+        height: int = DEFAULT_DUMMY_SHAPES["height"] // 4,
+        # Reduce img shape by 4 for FLUX to reduce memory usage on conversion
         **kwargs,
     ):
         super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)

optimum/intel/__init__.py

+1

@@ -51,6 +51,7 @@
         "IPEXModel",
     ]
 else:
+    _import_structure["utils.dummy_ipex_objects"] = []
     _import_structure["ipex"] = [
         "IPEXModelForCausalLM",
         "IPEXModelForSeq2SeqLM",

optimum/intel/ipex/modeling_base.py

+4 −2

@@ -62,7 +62,7 @@
 _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation")
 _IPEX_MINIMUM_VERSION_FOR_COMPILE = "2.5.0"
 # TODO: Some models are already fixed in torch 2.6, will enable them when torch upgrading to 2.6
-_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "beit", "llama", "falcon", "gpt2")
+_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "gpt_neox", "beit", "llama", "falcon", "gpt2")
 
 
 def _is_patched_with_ipex(model, task, use_cache: bool = True):
@@ -291,14 +291,16 @@ def forward(
         attention_mask: Optional[torch.FloatTensor] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
+        if self.add_patch and input_ids is not None and attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
         return self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
 
     def _prepare_generation_config(
         self, generation_config: Optional[GenerationConfig], **kwargs: Dict
     ) -> Tuple[GenerationConfig, Dict]:
         generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs)
         generation_method = generation_config.get_generation_mode().value
-        if self.compiled and generation_config.cache_implementation != "ipex_paged":
+        if self.compiled and generation_config.cache_implementation != "ipex_paged" and self._supports_static_cache:
             # Use static cache for torch compile
             generation_config.cache_implementation = "static"
         if generation_method not in _IPEX_EXPORTED_GENERATION_METHODS:

optimum/intel/openvino/configuration.py

+15 −1

@@ -344,6 +344,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
+            Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment
+            will be applied.
         all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
         sensitivity_metric (`str`, *optional*):
@@ -441,7 +443,7 @@ def post_init(self):
         Safety checker that arguments are correct
         """
         super().post_init()
-        if self.ratio is not None and not (0 <= self.ratio <= 1):
+        if not (0 <= self.ratio <= 1):
             raise ValueError("`ratio` must between 0 and 1.")
         if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
             raise ValueError("`group_size` must be greater than 0 or equal to -1")
@@ -461,6 +463,18 @@ def post_init(self):
                 or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
             )
 
+        if self.dataset is not None and not (
+            self.quant_method == OVQuantizationMethod.AWQ
+            or self.scale_estimation
+            or self.gptq
+            or self.lora_correction
+            or (self.ratio < 1.0 and self.sensitivity_metric != nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR)
+        ):
+            logger.warning(
+                "The provided dataset won't have any effect on the resulting compressed model because no data-aware "
+                "quantization algorithm is selected and compression ratio is 1.0."
+            )
+
         if self.bits not in [4, 8]:
             raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")

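A hedged illustration of when the new `post_init` warning fires (dataset name is an example; this assumes the check runs when the config is constructed, as the rest of this config's validation does):

from optimum.intel import OVWeightQuantizationConfig

# Dataset given, but no AWQ / scale estimation / GPTQ / LoRA correction and ratio
# left at the default 1.0: the dataset has no effect, so the new warning is logged.
config_warns = OVWeightQuantizationConfig(bits=4, dataset="wikitext2")

# ratio < 1.0 makes the mixed-precision assignment data-aware, so no warning.
config_ok = OVWeightQuantizationConfig(bits=4, dataset="wikitext2", ratio=0.8)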
setup.py

+1 −1

@@ -66,7 +66,7 @@
     "nncf": ["nncf>=2.14.0"],
     "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47"],
+    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47", "accelerate"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
