
Commit f094cad

Merge branch 'main' into quant
2 parents d91eefb + e465c7f commit f094cad

27 files changed (+1045 -321 lines)

.github/workflows/build_pr_documentation.yml (+1 -1)
@@ -60,7 +60,7 @@ jobs:
           echo ${{ env.COMMIT_SHA }} > ./commit_sha
           echo ${{ env.PR_NUMBER }} > ./pr_number

-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: doc-build-artifact
           path: optimum-intel/intel-doc-build/

.github/workflows/test_ipex.yml (+1 -1)
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["4.46.0", "4.46.3"]
+        transformers-version: ["4.47.0", "4.47.1"]
         torch-version: ["2.4.0", "2.5.*"]

     runs-on: ubuntu-22.04

Dockerfile.ipex (+1 -1)
@@ -43,7 +43,7 @@ ARG KMP_HW_SUBSET=1T
 ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"

-FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
+FROM intel/intel-extension-for-pytorch:2.5.10-xpu as xpu
 WORKDIR /usr/src/

 RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \

docs/source/openvino/export.mdx (+4 -5)
@@ -31,7 +31,7 @@ Check out the help for more options:

 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,10 +67,9 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8}
+  --quant-mode {int8,f8e4m3,f8e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
-                        activations. The only currently supported choice is 'int8' for int8 quantization of both
-                        weights and activations.
+                        activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
                         The library used to load the model before export. If not provided, will attempt to infer the
                         local checkpoint's library
@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 </Tip>


-Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Please see the example below.

 ```bash
 optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
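For illustration, a hypothetical invocation using one of the newly added float8 modes; the flags mirror the int8 example shown in the hunk above, and the output directory name is made up:

```bash
# Sketch: full quantization (weights and activations) with the new f8e4m3 mode,
# reusing the calibration settings from the int8 example above.
optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode f8e4m3 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo-f8e4m3
```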

optimum/commands/export/openvino.py (+5 -4)
@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8"],
+        choices=["int8", "f8e4m3", "f8e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
-            "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
         ),
     )
     optional_group.add_argument(
@@ -365,8 +364,10 @@ def run(self):
             quantization_config["trust_remote_code"] = self.args.trust_remote_code
             ov_config = OVConfig(quantization_config=quantization_config)
         else:
-            if self.args.quant_mode != "int8":
-                raise ValueError("Only 'int8' quantization mode is currently supported.")
+            if self.args.dataset is None:
+                raise ValueError(
+                    "Dataset is required for full quantization. Please provide it with --dataset argument."
+                )

             quantization_config = {
                 "weight_format": self.args.quant_mode,

optimum/exporters/ipex/cache_utils.py (+10 -7)
@@ -1,3 +1,4 @@
+import os
 from typing import List, Optional, Tuple

 import torch
@@ -33,21 +34,23 @@ class IPEXPagedCache(Cache):
     def __init__(
         self,
         config: PretrainedConfig,
-        batch_size: int,
+        max_batch_size: int,
         max_cache_len: int,
         device,
         dtype=None,
         layer_device_map=None,
         **kwargs,
     ) -> None:
         super().__init__()
-        self.batch_size = batch_size
+        self.max_batch_size = max_batch_size
         # Used in `generate` to keep tally of how many tokens the cache has seen
-        self._seen_tokens = torch.zeros([batch_size], dtype=torch.int32, device=device)
-        self.block_size = 16
-        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * batch_size
+
+        self._seen_tokens = torch.zeros([max_batch_size], dtype=torch.int32, device=device)
+        default_block_size = 16 if device.type == "cpu" else 64
+        self.block_size = int(os.environ.get("OI_PAGED_ATTN_BLOCK_SIZE", str(default_block_size)))
+        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * max_batch_size
         self.block_tables = -1 * torch.ones([self.num_blocks], dtype=torch.int32, device=device).reshape(
-            batch_size, -1
+            max_batch_size, -1
         )
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=device)
         self.max_cache_len = max_cache_len
@@ -191,7 +194,7 @@ def get_max_length(self) -> Optional[int]:

     def reset(self):
         """Resets the cache values while preserving the objects"""
-        self._seen_tokens = torch.zeros([self.batch_size], dtype=torch.int32, device=self.block_tables.device)
+        self._seen_tokens = torch.zeros([self.max_batch_size], dtype=torch.int32, device=self.block_tables.device)
         self.block_tables.fill_(-1)
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=self.block_tables.device)
         self.max_seq_len = 0
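With this change the paged-cache block size defaults to 16 on CPU and 64 on other devices, and can be overridden through the OI_PAGED_ATTN_BLOCK_SIZE environment variable read in the constructor above. A hypothetical override (the script name is illustrative):

```bash
# Force a 32-token block size for the IPEX paged KV cache, regardless of device type.
OI_PAGED_ATTN_BLOCK_SIZE=32 python run_ipex_generation.py
```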

optimum/exporters/ipex/model_patcher.py (+5 -3)
@@ -14,7 +14,7 @@

 from transformers.models.bert.modeling_bert import BertIntermediate
 from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel
-from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model
+from transformers.models.gpt2.modeling_gpt2 import GPT2MLP, GPT2Attention, GPT2Block, GPT2Model
 from transformers.models.llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaModel,
@@ -27,6 +27,7 @@

 from .modeling_utils import (
     _IPEX_MINIMUM_VERSION_FOR_PATCHING,
+    _IPEXGPT2MLP,
     _falcon_model_forward,
     _gpt2_block_forward,
     _gpt2_model_forward,
@@ -40,8 +41,8 @@


 # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
-_TRANSFORMERS_MIN_VERSION = "4.46.0"
-_TRANSFORMERS_MAX_VERSION = "4.46.99"
+_TRANSFORMERS_MIN_VERSION = "4.47.0"
+_TRANSFORMERS_MAX_VERSION = "4.47.99"

 _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)
@@ -111,6 +112,7 @@ def _patch_gpt2_model(model):
     convert_functions(model, GPT2Model, "forward", _gpt2_model_forward)
     convert_functions(model, GPT2Block, "forward", _gpt2_block_forward)
     convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.device, model.config)
+    convert_class(model, GPT2MLP, _IPEXGPT2MLP, model.device, model.config)
     return model