from intel_extension_for_pytorch.llm.modules import PagedAttention
from transformers import Cache, PretrainedConfig

+from optimum.intel.utils.import_utils import is_ipex_version
+

class IPEXPagedCache(Cache):
    """
@@ -43,10 +45,14 @@ def __init__(
    ) -> None:
        super().__init__()
        self.max_batch_size = max_batch_size
+        self.device = device
+        self._supports_flash_decoding = (
+            is_ipex_version(">", "2.4.99") if device.type == "cpu" else is_ipex_version(">", "2.5.99")
+        )
        # Used in `generate` to keep tally of how many tokens the cache has seen
        self._seen_tokens = torch.zeros([max_batch_size], dtype=torch.int32, device=device)
-        default_block_size = 16 if device.type == "cpu" else 64
+        default_block_size = 16
        self.block_size = int(os.environ.get("OI_PAGED_ATTN_BLOCK_SIZE", str(default_block_size)))
        self.num_blocks = (max_cache_len // self.block_size + (max_cache_len % self.block_size != 0)) * max_batch_size
        self.block_tables = -1 * torch.ones([self.num_blocks], dtype=torch.int32, device=device).reshape(
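
Note: the new `_supports_flash_decoding` flag boils down to a plain version comparison. Flash decoding is assumed available from IPEX 2.5 on CPU and from IPEX 2.6 on XPU, which is what the `">", "2.4.99"` / `">", "2.5.99"` checks encode. A minimal standalone sketch of the same gate, using `packaging` directly instead of `optimum.intel.utils.import_utils.is_ipex_version` (the function name below is illustrative, not part of this PR):

```python
from packaging import version

def supports_flash_decoding(ipex_version: str, device_type: str) -> bool:
    # Mirrors the check in __init__: CPU gains flash decoding with IPEX 2.5,
    # XPU with IPEX 2.6 (hence the ">2.4.99" / ">2.5.99" comparisons).
    threshold = "2.4.99" if device_type == "cpu" else "2.5.99"
    return version.parse(ipex_version) > version.parse(threshold)

assert supports_flash_decoding("2.5.0", "cpu") is True
assert supports_flash_decoding("2.5.0", "xpu") is False
assert supports_flash_decoding("2.6.0", "xpu") is True
```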
@@ -70,14 +76,44 @@ def __init__(
            key_cache_shape = (self.num_blocks, self.num_kv_heads, self.block_size, head_size)
            value_cache_shape = (self.num_blocks, self.num_kv_heads, self.block_size, head_size)
        elif device.type == "xpu":
-            key_cache_shape = (self.num_blocks, self.num_kv_heads, head_size, self.block_size, 1)
-            value_cache_shape = (self.num_blocks, self.num_kv_heads, head_size, self.block_size)
+            if self._supports_flash_decoding:
+                key_cache_shape = (self.num_blocks, self.block_size, self.num_kv_heads, head_size)
+                value_cache_shape = (self.num_blocks, self.block_size, self.num_kv_heads, head_size)
+            else:
+                key_cache_shape = (self.num_blocks, self.num_kv_heads, head_size, self.block_size, 1)
+                value_cache_shape = (self.num_blocks, self.num_kv_heads, head_size, self.block_size)
        for i in range(config.num_hidden_layers):
            new_layer_key_cache = torch.zeros(key_cache_shape, dtype=dtype, device=device)
            new_layer_value_cache = torch.zeros(value_cache_shape, dtype=dtype, device=device)
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)

+    def reshape_and_cache(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        slots: torch.Tensor,
+    ):
+        # TODO: unify API definition between CPU and XPU in IPEX version > 2.6
+        if self.device.type == "xpu" and self._supports_flash_decoding:
+            PagedAttention.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slots,
+            )
+        else:
+            PagedAttention.reshape_and_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slots,
+            )
+
    def update_for_prefill(
        self,
        key_states: torch.Tensor,
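
For orientation, the two XPU branches above differ only in memory layout: the flash-decoding path stores whole blocks of tokens contiguously with heads innermost (the layout `reshape_and_cache_flash` consumes), while the legacy path keeps a per-head, head_size-major layout with a trailing unit dimension on keys. A toy comparison with made-up sizes:

```python
import torch

num_blocks, block_size, num_kv_heads, head_size = 8, 16, 4, 64

# Flash-decoding XPU layout (identical for keys and values).
flash_cache = torch.zeros(num_blocks, block_size, num_kv_heads, head_size)

# Legacy XPU layout: keys carry an extra trailing unit dimension.
legacy_key_cache = torch.zeros(num_blocks, num_kv_heads, head_size, block_size, 1)
legacy_value_cache = torch.zeros(num_blocks, num_kv_heads, head_size, block_size)

# Same number of elements per layer either way; only the strides change.
assert flash_cache.numel() == legacy_key_cache.numel() == legacy_value_cache.numel()
```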
@@ -95,7 +131,7 @@ def update_for_prefill(
                block_table = self.free_blocks.nonzero().view(-1)[0:nb]
                self.block_tables[i][0:nb] = block_table
                self.free_blocks[block_table] = 0
-                slots_range = torch.arange(input_lens[i], device=key_states.device)
+                slots_range = torch.arange(input_lens[i], device=self.device)
                block_indices = slots_range // self.block_size
                slot_offsets = slots_range % self.block_size
                all_block_indices.append(self.block_tables[i][block_indices])
@@ -105,12 +141,8 @@ def update_for_prefill(
            all_slot_offsets = torch.cat(all_slot_offsets)
            self.slots = all_block_indices * self.block_size + all_slot_offsets
        # Update the cache
-        PagedAttention.reshape_and_cache(
-            key_states,
-            value_states,
-            self.key_cache[layer_idx],
-            self.value_cache[layer_idx],
-            self.slots,
+        self.reshape_and_cache(
+            key_states, value_states, self.key_cache[layer_idx], self.value_cache[layer_idx], self.slots
        )

        # Update the number of seen tokens
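
The `self.slots` computed above follows standard paged-KV addressing: a token's position splits into a logical block index and an in-block offset, and the per-sequence block table maps logical blocks to physical ones. A self-contained illustration (all values made up):

```python
import torch

# Illustrative; the real cache reads block_size from OI_PAGED_ATTN_BLOCK_SIZE.
block_size = 16
block_table = torch.tensor([7, 2, 11], dtype=torch.int32)  # logical -> physical blocks

def slot_for_position(pos: int) -> int:
    """Map a token position in one sequence to a physical cache slot."""
    block_idx = pos // block_size  # logical block holding this token
    offset = pos % block_size      # position inside that block
    return int(block_table[block_idx]) * block_size + offset

# Token 20 lives in logical block 1 (physical block 2), offset 4 -> slot 36.
assert slot_for_position(20) == 2 * block_size + 4
```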
@@ -128,7 +160,7 @@ def update_for_decode(
        if layer_idx == 0:
            start_block_idx = self._seen_tokens // self.block_size
            slot_offset_in_block = (self._seen_tokens) % self.block_size
-            self.slots = torch.zeros([batch_size], device=key_states.device, dtype=torch.int32)
+            self.slots = torch.zeros([batch_size], device=self.device, dtype=torch.int32)
            for i in range(batch_size):
                if slot_offset_in_block[i] == 0:
                    # need a new block:
@@ -139,12 +171,8 @@ def update_for_decode(
                    self.free_blocks[self.block_tables[i][b_idx]] = 0
                self.slots[i] = self.block_tables[i][start_block_idx[i]] * self.block_size + slot_offset_in_block[i]
        # Update the cache
-        PagedAttention.reshape_and_cache(
-            key_states,
-            value_states,
-            self.key_cache[layer_idx],
-            self.value_cache[layer_idx],
-            self.slots,
+        self.reshape_and_cache(
+            key_states, value_states, self.key_cache[layer_idx], self.value_cache[layer_idx], self.slots
        )

        # Update the number of seen tokens
@@ -194,16 +222,15 @@ def get_max_length(self) -> Optional[int]:

    def reset(self):
        """Resets the cache values while preserving the objects"""
-        self._seen_tokens = torch.zeros([self.max_batch_size], dtype=torch.int32, device=self.block_tables.device)
+        self._seen_tokens = torch.zeros([self.max_batch_size], dtype=torch.int32, device=self.device)
        self.block_tables.fill_(-1)
-        self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=self.block_tables.device)
+        self.free_blocks = torch.ones([self.num_blocks], dtype=torch.int32, device=self.device)
        self.max_seq_len = 0

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
-        device = self.block_tables.device
        origin_table = self.block_tables.clone()
-        updated_block_tables = self.block_tables.index_select(0, beam_idx.to(device))
+        updated_block_tables = self.block_tables.index_select(0, beam_idx.to(self.device))
        mask = self.block_tables.masked_fill(self.block_tables != -1, 1).masked_fill(self.block_tables == -1, 0)
        num_blocks = mask.cumsum(-1)[:, -1]
        updated_table = torch.zeros_like(beam_idx)
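
A side note on `reorder_cache`: the block-table reordering is a single `index_select` over the batch dimension, after which freed and duplicated blocks are reconciled via the mask. A stripped-down illustration of that `index_select` step with hypothetical tables:

```python
import torch

# Hypothetical per-sequence block tables (-1 marks unused entries).
block_tables = torch.tensor([[0, 1, -1],
                             [2, 3, 4],
                             [5, -1, -1]], dtype=torch.int32)
beam_idx = torch.tensor([1, 1, 0])  # beams 0 and 1 both fork from old beam 1

updated = block_tables.index_select(0, beam_idx)
# rows are now: old row 1, old row 1, old row 0
print(updated)
```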