// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/common.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"

// Expected values of the jit-compiled constants (for reference):
// constexpr size_t HEAD_SIZE = 64;
// constexpr size_t HEADS_NUM = 32;
// constexpr size_t KV_HEADS_NUM = 4;
// constexpr size_t BLOCK_SIZE = 16;
// constexpr size_t X_SIZE = 4;
// constexpr size_t MAX_SEQUENCE_LENGTH = 1024;

#define SUB_GROUP_SIZE 16

// The portion of HEAD_SIZE each work item processes
#define HEAD_ITEMS_PER_WI (HEAD_SIZE / SUB_GROUP_SIZE)

// How many QK outputs each subgroup calculates per iteration
#define QK_PER_SG 4

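// Note (inferred, not stated in the source): KV_CACHE_BLOCK_STRIDE below appears to be the number
// of elements one physical KV cache block occupies, i.e. BLOCK_SIZE tokens with HEAD_SIZE elements
// for each of the HEADS_NUM heads.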
#define KV_CACHE_BLOCK_STRIDE (HEAD_SIZE * HEADS_NUM * BLOCK_SIZE)

#define QUERY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset)

#define SUBGROUPS_PER_WG (HEAD_SIZE / SUB_GROUP_SIZE)

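// A sketch of the dispatch geometry this kernel appears to assume (inferred from the
// get_global_id() usage and the required work-group size below, not stated in the source):
//   GWS = { sequences_num, HEADS_NUM, HEAD_SIZE }
//   LWS = { 1, 1, SUBGROUPS_PER_WG * SUB_GROUP_SIZE }
// i.e. one work-group of SUBGROUPS_PER_WG subgroups per (sequence, head) pair.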
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUBGROUPS_PER_WG * SUB_GROUP_SIZE)))
KERNEL(pa_sdpa_ref)(
    OPTIONAL_SHAPE_INFO_ARG
    __global const INPUT0_TYPE* query,
    __global const INPUT1_TYPE* key_cache,
    __global const INPUT2_TYPE* value_cache,
    __global const INPUT3_TYPE* max_context_len,
    __global const INPUT4_TYPE* context_lens,
    __global const INPUT5_TYPE* block_tables,
    __global OUTPUT_TYPE* output)
{
    const uint seq_idx = get_global_id(0);
    const uint head_num_idx = get_global_id(1);
    const uint head_idx = get_global_id(2);
    const uint sglid = get_sub_group_local_id();
    const uint sgid = get_sub_group_id();

    const uint batch_idx = seq_idx / INPUT0_FEATURE_NUM;
    const uint token_idx = seq_idx % INPUT0_FEATURE_NUM;
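    // Note (assumption, inferred from the division above): seq_idx enumerates (batch, token)
    // pairs, with INPUT0_FEATURE_NUM being the number of tokens per batch entry of the query.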

    const uint context_len = context_lens[batch_idx];

    const uint blocks_num = INPUT5_FEATURE_NUM;

    // Each subgroup handles QK_PER_SG consecutive tokens of a block, e.g. for the first block:
    // sgid0: tokens 0..3
    // sgid1: tokens 4..7
    // sgid2: tokens 8..11
    // sgid3: tokens 12..15
    // and for the second block:
    // sgid0: tokens 16..19
    // sgid1: tokens 20..23
    // sgid2: tokens 24..27
    // sgid3: tokens 28..31

    // TODO: Need to make the blocks division more flexible. The current approach assumes
    // 4 SGs per WG, where each SG processes 4 QK outputs, so 16 in total per WG.

    __local OUTPUT_TYPE qk_vals[SHARED_MEM_SIZE];

    OUTPUT_TYPE qk_max = OUTPUT_VAL_MIN;

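    // Note (inferred, not stated in the source): each loop iteration processes one KV cache block
    // of this sequence. block_tables appears to have shape [batches, blocks_num] and to map the
    // sequence's logical block index to a physical block of the paged cache; block_offset is then
    // the element offset of that physical block within key_cache/value_cache.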
    for (uint block = 0; block < blocks_num; block++) {
        const uint block_idx = batch_idx * blocks_num + block;
        const uint block_offset = block_tables[block_idx] * KV_CACHE_BLOCK_STRIDE;

        OUTPUT_TYPE qk[QK_PER_SG] = {0};

        for (uint hs = 0; hs < HEAD_ITEMS_PER_WI; hs++) {
            const uint query_idx = seq_idx * HEAD_SIZE * HEADS_NUM + hs * SUB_GROUP_SIZE;

            // TODO: can be preloaded outside HEAD_ITEMS_PER_WI loop - need to check perf
            INPUT0_TYPE q = QUERY_BLOCK_READ(query, query_idx);
            for (uint qk_idx = 0; qk_idx < QK_PER_SG; qk_idx++) {
                uint current_token = block * BLOCK_SIZE + sgid * QK_PER_SG + qk_idx;
                if (current_token >= context_len)
                    continue;

                const uint key_idx = block_offset +
                                     (X_SIZE * QK_PER_SG) * sgid +
                                     (HEAD_ITEMS_PER_WI * BLOCK_SIZE * X_SIZE) * hs +
                                     (sglid / X_SIZE) * X_SIZE * BLOCK_SIZE +
                                     (sglid % X_SIZE) + qk_idx * X_SIZE;
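                // Note (inferred from the indexing above, not stated in the source): the key cache
                // block seems to use a [HEAD_SIZE / X_SIZE, BLOCK_SIZE, X_SIZE] layout per head
                // (vLLM-style X-blocked layout), so each work item gathers its HEAD_ITEMS_PER_WI
                // head elements for each of its QK_PER_SG tokens.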
                // TODO1: try block loading and shuffling
                // TODO2: try to load k*4 times and then calculate
                // TODO3: try bigger X block
                INPUT1_TYPE k = key_cache[key_idx];

                qk[qk_idx] = mad(q, k, qk[qk_idx]);
            }
        }

        // Accumulate each QK output across all work items of the subgroup and track the running max
        for (uint qk_idx = 0; qk_idx < QK_PER_SG; qk_idx++) {
            qk[qk_idx] = sub_group_reduce_add(qk[qk_idx]);
            qk_max = OUTPUT_MAX_FUNC(qk_max, qk[qk_idx]);
        }

        // Save QK results to local memory
        if (sglid < QK_PER_SG) {
            const uint qk_local_idx = block * BLOCK_SIZE + sgid * QK_PER_SG + sglid;
            qk_vals[qk_local_idx] = qk[sglid];
        }
    }
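
    // Note (descriptive, inferred): at this point qk_vals in local memory holds the raw QK dot
    // products (attention logits) for the first context_len tokens of this sequence, and qk_max
    // holds each subgroup's partial maximum; the SoftMax below first reduces these partial
    // maximums across the whole work-group.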

    /* WARNING: bias needs to be added before SoftMax */

    // Apply SoftMax operation
    __local OUTPUT_TYPE qk_max_vals[SUBGROUPS_PER_WG];
    __local OUTPUT_TYPE qk_sum_vals[SUBGROUPS_PER_WG];
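    // Note: these arrays hold one partial value per subgroup; they are used to reduce the per-SG
    // maximums and exp-sums across the whole work-group in the two stages below.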
    {
        if (sglid == 0)
            qk_max_vals[sgid] = qk_max;

        barrier(CLK_LOCAL_MEM_FENCE);

        qk_max = OUTPUT_VAL_MIN;
        if (sglid < SUBGROUPS_PER_WG)
            qk_max = qk_max_vals[sglid];

        // Final max value after the reduction across all SGs and WIs
        qk_max = sub_group_reduce_max(qk_max);

        OUTPUT_TYPE exp_sum = OUTPUT_VAL_ZERO;
        for (uint qk_idx = 0; qk_idx < CEIL_DIV(context_len, SUBGROUPS_PER_WG * SUB_GROUP_SIZE); qk_idx++) {
            const uint data_idx = qk_idx * (SUBGROUPS_PER_WG * SUB_GROUP_SIZE) + sgid * SUB_GROUP_SIZE + sglid;
            if (data_idx < context_len) {
                OUTPUT_TYPE val = native_exp(qk_vals[data_idx] - qk_max);
                exp_sum += val;
                qk_vals[data_idx] = val;
            }
        }

        exp_sum = sub_group_reduce_add(exp_sum);

        if (sglid == 0)
            qk_sum_vals[sgid] = exp_sum;

        barrier(CLK_LOCAL_MEM_FENCE);

        // Reset the partial sum so that lanes which do not load a per-SG value below
        // don't contribute their old value to the final reduction
        exp_sum = OUTPUT_VAL_ZERO;
        if (sglid < SUBGROUPS_PER_WG)
            exp_sum = qk_sum_vals[sglid];

        // Final sum of exp values across all SGs and WIs
        exp_sum = sub_group_reduce_add(exp_sum);

        const OUTPUT_TYPE inv_sum = OUTPUT_VAL_ONE / exp_sum;

        // Normalize the exponentiated values so that qk_vals holds the SoftMax probabilities
        for (uint qk_idx = 0; qk_idx < CEIL_DIV(context_len, SUBGROUPS_PER_WG * SUB_GROUP_SIZE); qk_idx++) {
            const uint data_idx = qk_idx * (SUBGROUPS_PER_WG * SUB_GROUP_SIZE) + sgid * SUB_GROUP_SIZE + sglid;
            if (data_idx < context_len) {
                OUTPUT_TYPE val = qk_vals[data_idx] * inv_sum;
                qk_vals[data_idx] = val;
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }

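    // Note (descriptive, inferred): value accumulation (SoftMax results x value_cache) is not
    // implemented in this snippet; the line below only writes a slice of the SoftMax results,
    // apparently as a temporary/debug output.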
    output[seq_idx + sglid] = qk_vals[sglid % context_len];
}