
Commit 7e20b86

Merge branch 'main' into varlen

2 parents 3fdb3a5 + a76be08

24 files changed: +481 −111 lines

.github/workflows/dockerfile_sanity.yml (+4 −4)

```diff
@@ -5,13 +5,13 @@ on:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
+      - 'Dockerfile.ipex'
 
   pull_request:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
+      - 'Dockerfile.ipex'
 
 jobs:
   build_and_run:
@@ -27,7 +27,7 @@ jobs:
       - name: Build and Run Docker Image
         run: |
           IMAGE_NAME="intel_image:latest"
-          docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+          docker build -f Dockerfile.ipex -t $IMAGE_NAME .
           if [ $? -ne 0 ]; then
             echo "Docker image build failed."
             exit 1
```

.github/workflows/test_openvino.yml (+3 −2)

```diff
@@ -1,6 +1,7 @@
 name: OpenVINO - Test
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - main
@@ -46,9 +47,9 @@ jobs:
           pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
+        name: Install specific dependencies and versions required for older transformers
         run: |
-          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
       - if: ${{ matrix.test-pattern == '*modeling*' }}
         name: Uninstall NNCF
```

.github/workflows/test_openvino_slow.yml (+2 −2)

```diff
@@ -46,8 +46,8 @@ jobs:
           pip uninstall -y nncf
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
-        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+        name: Install specific dependencies and versions required for older transformers
+        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*, diffusers==0.30.* transformers_stream_generator
 
       - name: Pip freeze
         run: pip freeze
```

Dockerfile.ipex (new file, +73)

```dockerfile
ARG PLATFORM=cpu

FROM ubuntu:22.04 as cpu
WORKDIR /usr/src/
RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
    ca-certificates \
    git \
    curl \
    vim \
    build-essential \
    ccache \
    libgoogle-perftools-dev \
    numactl \
    cmake \
    libjpeg-dev \
    pybind11-dev \
    libpng-dev \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

ARG IPEX_VERSION=2.5.0
ARG PYTORCH_VERSION=2.5.1
ARG TORCHVISION_VERSION=0.20.1+cpu
ARG TORCHAUDIO_VERSION=2.5.1+cpu

RUN python3 -m pip install --no-cache-dir \
    torch==${PYTORCH_VERSION}+cpu \
    torchvision==${TORCHVISION_VERSION} \
    torchaudio==${TORCHAUDIO_VERSION} \
    --index-url https://download.pytorch.org/whl/cpu && \
    python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \
    python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \
    python3 -m pip install --no-cache-dir py-libnuma

ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
ARG KMP_HW_SUBSET=1T
ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"

FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
WORKDIR /usr/src/

RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
    ca-certificates \
    git \
    curl \
    vim \
    ccache \
    libgoogle-perftools-dev \
    numactl \
    libjpeg-dev \
    pybind11-dev \
    libpng-dev \
    && rm -rf /var/lib/apt/lists/*"
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils

FROM ${PLATFORM}

COPY optimum optimum
COPY Makefile setup.cfg setup.py pyproject.toml README.md ./
RUN pip install .
```
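The final `FROM ${PLATFORM}` stage is selected by the `PLATFORM` build argument, so the same Dockerfile produces either the CPU or the XPU image. A minimal build sketch, assuming it is run from the repository root where the copied files live (the image tags are only illustrative):

```bash
# CPU image (PLATFORM defaults to cpu)
docker build -f Dockerfile.ipex -t intel_image:latest .

# XPU (Intel GPU) image, selecting the xpu stage via the build argument
docker build -f Dockerfile.ipex --build-arg PLATFORM=xpu -t intel_image:xpu .
```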

README.md (+3 −3)

````diff
@@ -6,7 +6,7 @@
 
 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.
 
-[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion.
+[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations like faster attention and operators fusion.
 
 Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target.
 
@@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 
 
 ## IPEX
-To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model.
+To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint, and apply IPEX operators optimization (replaced with customized IPEX operators).
 ```diff
   from transformers import AutoTokenizer, pipeline
 - from transformers import AutoModelForCausalLM
@@ -168,7 +168,7 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with
 
   model_id = "gpt2"
 - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
   tokenizer = AutoTokenizer.from_pretrained(model_id)
   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
   results = pipe("He's a dreadful magician and")
````

docker/Dockerfile.intel (−53)

This file was deleted.

docs/source/ipex/inference.mdx (+2 −1)

````diff
@@ -14,7 +14,7 @@ Optimum Intel can be used to load models from the [Hub](https://huggingface.co/m
 
 ## Loading
 
-You can load your model and apply IPEX optimizations (apply torch.compile for non-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
+You can load your model and apply IPEX optimizations (apply torch.compile except text-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
 For now, support is enabled for Intel CPU/GPU. Previous models converted to TorchScript will be deprecated in v1.22.
 
 ```diff
@@ -43,3 +43,4 @@ As shown in the table below, each task is associated with a class enabling to au
 | `IPEXModelForMaskedLM`            | `fill-mask`              |
 | `IPEXModelForAudioClassification` | `audio-classification`   |
 | `IPEXModelForCausalLM`            | `text-generation`        |
+| `IPEXModelForSeq2SeqLM`           | `text2text-generation`   |
````

docs/source/ipex/models.mdx (+1)

```diff
@@ -40,6 +40,7 @@ Here is the list of the supported architectures :
 - Roberta
 - Roformer
 - SqueezeBert
+- T5
 - UniSpeech
 - Vit
 - Wav2Vec2
```

docs/source/openvino/export.mdx (+5 −2)

```diff
@@ -78,7 +78,8 @@ Optional arguments:
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
                         quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
                         quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-                        and inference latency. Default value is 1.0.
+                        and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+                        less than 1.0, then data-aware mixed precision assignment will be applied.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
@@ -94,7 +95,9 @@ Optional arguments:
                         can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
                         be collected from model's generations. For diffusion models it should be on of
                         ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-                        visual language models the dataset must be set to 'contextual'.
+                        visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+                        compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+                        argument will not have an effect on the resulting model.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
```
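Per the updated help text, the dataset only influences weight compression when a data-aware path is taken, for example when `--ratio` is below 1.0. A hedged sketch of such an export, assuming the `optimum-cli export openvino` entry point and its `--weight-format` option (the model id and output directory are placeholders):

```bash
# int4 weight compression with data-aware mixed-precision assignment:
# the wikitext2 dataset is used because --ratio is below 1.0
optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf \
  --weight-format int4 --ratio 0.8 --dataset wikitext2 \
  llama2_int4_ov
```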

notebooks/ipex/text_generation.ipynb (+2 −2)

```diff
@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model."
+    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It could apply IPEX, providing optimizations like faster attention and operators fusion."
    ]
   },
   {
@@ -60,7 +60,7 @@
    }
   ],
   "source": [
-    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
+    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
    "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
    "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",
```

optimum/commands/export/openvino.py (+5 −2)

```diff
@@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be on of ['conceptual_captions',"
             "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(
```

optimum/exporters/openvino/__main__.py (−3)

```diff
@@ -474,9 +474,6 @@ class StoreAttr(object):
         from optimum.intel.openvino.quantization import _weight_only_quantization
 
         _weight_only_quantization(submodel, quantization_config)
-        if "text-generation" in task:
-            submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"])
-
         compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
         save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
         del submodel
```

optimum/exporters/openvino/model_configs.py (+3 −2)

```diff
@@ -1804,8 +1804,9 @@ def __init__(
         normalized_config: NormalizedVisionConfig,
         batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
         num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
-        width: int = DEFAULT_DUMMY_SHAPES["width"],
-        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"] // 4,
+        height: int = DEFAULT_DUMMY_SHAPES["height"] // 4,
+        # Reduce img shape by 4 for FLUX to reduce memory usage on conversion
         **kwargs,
     ):
         super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
```

optimum/intel/__init__.py (+3)

```diff
@@ -51,8 +51,10 @@
         "IPEXModel",
     ]
 else:
+    _import_structure["utils.dummy_ipex_objects"] = []
     _import_structure["ipex"] = [
         "IPEXModelForCausalLM",
+        "IPEXModelForSeq2SeqLM",
         "IPEXModelForSequenceClassification",
         "IPEXModelForMaskedLM",
         "IPEXModelForTokenClassification",
@@ -247,6 +249,7 @@
         IPEXModelForImageClassification,
         IPEXModelForMaskedLM,
         IPEXModelForQuestionAnswering,
+        IPEXModelForSeq2SeqLM,
         IPEXModelForSequenceClassification,
         IPEXModelForTokenClassification,
     )
```

optimum/intel/ipex/__init__.py (+1)

```diff
@@ -20,6 +20,7 @@
     IPEXModelForImageClassification,
     IPEXModelForMaskedLM,
     IPEXModelForQuestionAnswering,
+    IPEXModelForSeq2SeqLM,
     IPEXModelForSequenceClassification,
     IPEXModelForTokenClassification,
 )
```
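With `IPEXModelForSeq2SeqLM` now exported from `optimum.intel`, T5 listed among the supported architectures, and `text2text-generation` mapped to the class in the docs table, the new class can be used like the other IPEX classes. A minimal sketch, assuming an illustrative `t5-small` checkpoint:

```python
from transformers import AutoTokenizer, pipeline

from optimum.intel import IPEXModelForSeq2SeqLM

model_id = "t5-small"  # illustrative checkpoint; T5 is listed as a supported architecture
model = IPEXModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# text2text-generation is the task associated with IPEXModelForSeq2SeqLM in the docs table
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
print(pipe("translate English to German: The house is wonderful."))
```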
