
Commit 84e1c98

Merge branch 'main' into transformers-4.47
2 parents: 63f3b8d + 7b4044d

23 files changed: +945 −270 lines

docs/source/openvino/export.mdx

+4 −5

@@ -31,7 +31,7 @@ Check out the help for more options:
 
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]

@@ -67,10 +67,9 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8}
+  --quant-mode {int8,f8e4m3,f8e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
-                        activations. The only currently supported choice is 'int8' for int8 quantization of both
-                        weights and activations.
+                        activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
                         The library used to load the model before export. If not provided, will attempt to infer the
                         local checkpoint's library

@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 </Tip>
 
 
-Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Please see the example below.
 
 ```bash
 optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
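The new `f8e4m3` and `f8e5m2` choices make FP8 full quantization selectable the same way as `int8`. As a hedged illustration (not part of the commit), the sketch below drives the same CLI from Python with one of the new modes; it reuses the flags from the int8 example above, the output directory name is made up, and whether every tuning flag (e.g. `--smooth-quant-alpha`) is meaningful for the FP8 modes is an assumption.

```python
# Hypothetical usage sketch: export Whisper with FP8 (f8e4m3) full quantization
# by calling the same optimum-cli command shown above from Python.
import subprocess

cmd = [
    "optimum-cli", "export", "openvino",
    "-m", "openai/whisper-large-v3-turbo",
    "--quant-mode", "f8e4m3",            # new choice; "f8e5m2" is the other FP8 variant
    "--dataset", "librispeech",
    "--num-samples", "32",
    "--smooth-quant-alpha", "0.9",
    "./whisper-large-v3-turbo-f8e4m3",   # hypothetical output directory
]
subprocess.run(cmd, check=True)
```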

optimum/commands/export/openvino.py

+1 −5

@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8"],
+        choices=["int8", "f8e4m3", "f8e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
-            "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
         ),
     )
     optional_group.add_argument(

@@ -365,9 +364,6 @@ def run(self):
                quantization_config["trust_remote_code"] = self.args.trust_remote_code
                ov_config = OVConfig(quantization_config=quantization_config)
            else:
-               if self.args.quant_mode != "int8":
-                   raise ValueError("Only 'int8' quantization mode is currently supported.")
-
                quantization_config = {
                    "weight_format": self.args.quant_mode,
                    "activation_format": self.args.quant_mode,

optimum/exporters/ipex/cache_utils.py

+4 −1

@@ -1,3 +1,4 @@
+import os
 from typing import List, Optional, Tuple
 
 import torch

@@ -43,8 +44,10 @@ def __init__(
         super().__init__()
         self.max_batch_size = max_batch_size
         # Used in `generate` to keep tally of how many tokens the cache has seen
+
         self._seen_tokens = torch.zeros([max_batch_size], dtype=torch.int32, device=device)
-        self.block_size = 16
+        default_block_size = 16 if device.type == "cpu" else 64
+        self.block_size = int(os.environ.get("OI_PAGED_ATTN_BLOCK_SIZE", str(default_block_size)))
         self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * max_batch_size
         self.block_tables = -1 * torch.ones([self.num_blocks], dtype=torch.int32, device=device).reshape(
             max_batch_size, -1
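The paged-attention cache now picks a device-dependent default block size (16 on CPU, 64 otherwise) and lets the `OI_PAGED_ATTN_BLOCK_SIZE` environment variable override it; the number of blocks is a ceiling division of `max_cache_len` by the block size, multiplied by the batch size. The sketch below reproduces only that arithmetic with example values; it is not the real cache class.

```python
# Standalone sketch of the block-size selection and block-count math from the hunk above.
import os

def paged_cache_geometry(device_type: str, max_cache_len: int, max_batch_size: int):
    # Device-dependent default, overridable via OI_PAGED_ATTN_BLOCK_SIZE.
    default_block_size = 16 if device_type == "cpu" else 64
    block_size = int(os.environ.get("OI_PAGED_ATTN_BLOCK_SIZE", str(default_block_size)))
    # Ceiling division: enough blocks per sequence to hold max_cache_len tokens.
    blocks_per_sequence = max_cache_len // block_size + (max_cache_len % block_size != 0)
    return block_size, blocks_per_sequence * max_batch_size

print(paged_cache_geometry("xpu", max_cache_len=1000, max_batch_size=4))
# (64, 64): 16 blocks of 64 tokens per sequence, for 4 sequences
```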

optimum/exporters/ipex/model_patcher.py

+3 −1

@@ -14,7 +14,7 @@
 
 from transformers.models.bert.modeling_bert import BertIntermediate
 from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel
-from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model
+from transformers.models.gpt2.modeling_gpt2 import GPT2MLP, GPT2Attention, GPT2Block, GPT2Model
 from transformers.models.llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaModel,

@@ -27,6 +27,7 @@
 
 from .modeling_utils import (
     _IPEX_MINIMUM_VERSION_FOR_PATCHING,
+    _IPEXGPT2MLP,
     _falcon_model_forward,
     _gpt2_block_forward,
     _gpt2_model_forward,

@@ -111,6 +112,7 @@ def _patch_gpt2_model(model):
     convert_functions(model, GPT2Model, "forward", _gpt2_model_forward)
     convert_functions(model, GPT2Block, "forward", _gpt2_block_forward)
     convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.config)
+    convert_class(model, GPT2MLP, _IPEXGPT2MLP, model.config)
     return model
 
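The GPT-2 patcher now also replaces `GPT2MLP` modules with `_IPEXGPT2MLP`, using the same `convert_class` helper as the attention swap. The real `convert_class` lives in the IPEX exporter and is not shown in this diff; the sketch below only illustrates the general "walk the module tree and swap matching classes" pattern it is assumed to follow, with the constructor signature `new_cls(original_module, config)` also being an assumption.

```python
# Hedged sketch of a convert_class-style helper: recursively replace every
# submodule of type orig_cls with new_cls(submodule, *args). The actual
# optimum implementation may differ (weight transfer, config handling, etc.).
import torch

def convert_class_sketch(model: torch.nn.Module, orig_cls, new_cls, *args):
    for name, child in model.named_children():
        if isinstance(child, orig_cls):
            setattr(model, name, new_cls(child, *args))  # assumed constructor signature
        else:
            convert_class_sketch(child, orig_cls, new_cls, *args)
    return model
```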
