
Commit f094cad

Merge branch 'main' into quant
2 parents d91eefb + e465c7f commit f094cad

27 files changed (+1045 -321 lines)

.github/workflows/build_pr_documentation.yml (+1 -1)
@@ -60,7 +60,7 @@ jobs:
           echo ${{ env.COMMIT_SHA }} > ./commit_sha
           echo ${{ env.PR_NUMBER }} > ./pr_number

-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: doc-build-artifact
           path: optimum-intel/intel-doc-build/

.github/workflows/test_ipex.yml (+1 -1)
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["4.46.0", "4.46.3"]
+        transformers-version: ["4.47.0", "4.47.1"]
         torch-version: ["2.4.0", "2.5.*"]

     runs-on: ubuntu-22.04

Dockerfile.ipex (+1 -1)
@@ -43,7 +43,7 @@ ARG KMP_HW_SUBSET=1T
 ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"

-FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
+FROM intel/intel-extension-for-pytorch:2.5.10-xpu as xpu
 WORKDIR /usr/src/

 RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \

docs/source/openvino/export.mdx (+4 -5)
@@ -31,7 +31,7 @@ Check out the help for more options:

 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,10 +67,9 @@ Optional arguments:
                         on your local machine arbitrary code present in the model repository.
   --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
                         The weight format of the exported model.
-  --quant-mode {int8}
+  --quant-mode {int8,f8e4m3,f8e5m2}
                         Quantization precision mode. This is used for applying full model quantization including
-                        activations. The only currently supported choice is 'int8' for int8 quantization of both
-                        weights and activations.
+                        activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
                         The library used to load the model before export. If not provided, will attempt to infer the
                         local checkpoint's library
@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
 </Tip>


-Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
+Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected mode. Please see the example below.

 ```bash
 optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
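For illustration, a hypothetical invocation using one of the newly added float8 modes; the flags mirror the int8 example shown in the hunk above, and the output directory name is made up:

```bash
# Sketch: full quantization (weights and activations) with the new f8e4m3 mode,
# reusing the calibration settings from the int8 example above.
optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode f8e4m3 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo-f8e4m3
```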

optimum/commands/export/openvino.py (+5 -4)
@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8"],
+        choices=["int8", "f8e4m3", "f8e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
-            "The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
         ),
     )
     optional_group.add_argument(
@@ -365,8 +364,10 @@ def run(self):
             quantization_config["trust_remote_code"] = self.args.trust_remote_code
             ov_config = OVConfig(quantization_config=quantization_config)
         else:
-            if self.args.quant_mode != "int8":
-                raise ValueError("Only 'int8' quantization mode is currently supported.")
+            if self.args.dataset is None:
+                raise ValueError(
+                    "Dataset is required for full quantization. Please provide it with --dataset argument."
+                )

             quantization_config = {
                 "weight_format": self.args.quant_mode,

optimum/exporters/ipex/cache_utils.py (+10 -7)
@@ -1,3 +1,4 @@
+import os
 from typing import List, Optional, Tuple

 import torch
@@ -33,21 +34,23 @@ class IPEXPagedCache(Cache):
     def __init__(
         self,
         config: PretrainedConfig,
-        batch_size: int,
+        max_batch_size: int,
         max_cache_len: int,
         device,
         dtype=None,
         layer_device_map=None,
         **kwargs,
     ) -> None:
         super().__init__()
-        self.batch_size = batch_size
+        self.max_batch_size = max_batch_size
         # Used in `generate` to keep tally of how many tokens the cache has seen
-        self._seen_tokens = torch.zeros([batch_size], dtype=torch.int32, device=device)
-        self.block_size = 16
-        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * batch_size
+
+        self._seen_tokens = torch.zeros([max_batch_size], dtype=torch.int32, device=device)
+        default_block_size = 16 if device.type == "cpu" else 64
+        self.block_size = int(os.environ.get("OI_PAGED_ATTN_BLOCK_SIZE", str(default_block_size)))
+        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * max_batch_size
         self.block_tables = -1 * torch.ones([self.num_blocks], dtype=torch.int32, device=device).reshape(
-            batch_size, -1
+            max_batch_size, -1
         )
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=device)
         self.max_cache_len = max_cache_len
@@ -191,7 +194,7 @@ def get_max_length(self) -> Optional[int]:

     def reset(self):
         """Resets the cache values while preserving the objects"""
-        self._seen_tokens = torch.zeros([self.batch_size], dtype=torch.int32, device=self.block_tables.device)
+        self._seen_tokens = torch.zeros([self.max_batch_size], dtype=torch.int32, device=self.block_tables.device)
         self.block_tables.fill_(-1)
         self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=self.block_tables.device)
         self.max_seq_len = 0
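With this change the paged-cache block size defaults to 16 on CPU and 64 on other devices, and can be overridden through the OI_PAGED_ATTN_BLOCK_SIZE environment variable read in the constructor above. A hypothetical override (the script name is illustrative):

```bash
# Force a 32-token block size for the IPEX paged KV cache, regardless of device type.
OI_PAGED_ATTN_BLOCK_SIZE=32 python run_ipex_generation.py
```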

optimum/exporters/ipex/model_patcher.py (+5 -3)
@@ -14,7 +14,7 @@

 from transformers.models.bert.modeling_bert import BertIntermediate
 from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel
-from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model
+from transformers.models.gpt2.modeling_gpt2 import GPT2MLP, GPT2Attention, GPT2Block, GPT2Model
 from transformers.models.llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaModel,
@@ -27,6 +27,7 @@

 from .modeling_utils import (
     _IPEX_MINIMUM_VERSION_FOR_PATCHING,
+    _IPEXGPT2MLP,
     _falcon_model_forward,
     _gpt2_block_forward,
     _gpt2_model_forward,
@@ -40,8 +41,8 @@


 # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
-_TRANSFORMERS_MIN_VERSION = "4.46.0"
-_TRANSFORMERS_MAX_VERSION = "4.46.99"
+_TRANSFORMERS_MIN_VERSION = "4.47.0"
+_TRANSFORMERS_MAX_VERSION = "4.47.99"

 _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)
@@ -111,6 +112,7 @@ def _patch_gpt2_model(model):
     convert_functions(model, GPT2Model, "forward", _gpt2_model_forward)
     convert_functions(model, GPT2Block, "forward", _gpt2_block_forward)
     convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.device, model.config)
+    convert_class(model, GPT2MLP, _IPEXGPT2MLP, model.device, model.config)
     return model