[GPU] PagedAttention initial impl

sshlyapn · sshlyapn · commit f9866affcdd2 · 2024-03-08T18:19:47.000+04:00
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp
@@ -100,7 +100,8 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
                             value_cache_mem,              /* value_cache */
                             instance.input_memory_ptr(7), /* max_context_len */
                             instance.input_memory_ptr(8), /* context_lens */
-                            instance.input_memory_ptr(9)  /* block_tables */ };
+                            instance.input_memory_ptr(9), /* block_tables */
+                            instance.input_memory_ptr(10) /* scale */ };
             args.outputs = { instance.output_memory_ptr(0) };
         }
 
@@ -279,20 +280,22 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
     static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, bool is_dynamic = false) {
         auto params = get_default_params<kernel_selector::sdpa_params>(impl_param, is_dynamic);
 
-        const auto inputs_count = 6;
+        const auto inputs_count = 7;
         const auto query_layout = impl_param.get_input_layout(0);
         const auto key_cache_layout = impl_param.get_input_layout(3);
         const auto value_cache_layout = impl_param.get_input_layout(4);
         const auto max_context_len_layout = impl_param.get_input_layout(7);
         const auto context_lens_layout = impl_param.get_input_layout(8);
         const auto block_tables_layout = impl_param.get_input_layout(9);
+        const auto scale_layout = impl_param.get_input_layout(10);
 
         params.inputs.resize(inputs_count);
         params.inputs[1] = convert_data_tensor(key_cache_layout);
         params.inputs[2] = convert_data_tensor(value_cache_layout);
         params.inputs[3] = convert_data_tensor(max_context_len_layout);
         params.inputs[4] = convert_data_tensor(context_lens_layout);
         params.inputs[5] = convert_data_tensor(block_tables_layout);
+        params.inputs[6] = convert_data_tensor(scale_layout);
 
         if (query_layout.is_static() && key_cache_layout.is_static() && value_cache_layout.is_static()) {
             // query_shape = [batch_size, seq_len, heads_num * head_size]
@@ -328,6 +331,7 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
             {3, in_offsets_map.at(7)},
             {4, in_offsets_map.at(8)},
             {5, in_offsets_map.at(9)},
+            {6, in_offsets_map.at(10)},
         };
         std::map<size_t, size_t> out_tensor_to_offset_map = {
             {0, out_offsets_map.at(0)},
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -29,6 +29,7 @@
 #include "kv_cache_inst.h"
 #include "condition_inst.h"
 #include "gather_inst.h"
+#include "paged_attention_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 #include "implementation_map.hpp"
 #include "graph_optimizer/prepare_buffer_fusing.h"
@@ -553,6 +554,12 @@ event::ptr primitive_inst::realloc_if_needed() {
         }
     }
 
+    // WA: reallocate PagedAttention output memory if the previously allocated buffer is not usm_device (e.g. usm_host memory reused from the prefill stage inner model)
+    if (_node->is_type<paged_attention>() && _outputs[0] && _outputs[0]->get_allocation_type() != allocation_type::usm_device) {
+        GPU_DEBUG_TRACE_DETAIL << id() << " reset memory\n";
+        _max_output_layout_count = 0;
+    }
+
     // update layout to ensure that it repsects paddings for correct allocation size
     if (_node_output_layout.data_padding.get_dynamic_pad_dims() != tensor(0)) {
         size_t rank = updated_layout.get_shape().size();
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl
@@ -32,8 +32,8 @@ KERNEL(pa_kv_cache_update)(
                             head_elem_idx * KV_CACHE_BLOCK_SIZE +
                             block_offset;
 
-    // if (INPUT0_FEATURE_NUM == 18 && INPUT0_BATCH_NUM == 2) {
-    //     printf("%d. %d - value\n", out_offset, in_offset);
+    // if (batch_idx == 0) {
+    //     printf("Update value %d. %d (%f)\n", out_offset, in_offset, value_data[in_offset]);
     // }
 
     value_cache_data[out_offset] = value_data[in_offset];
@@ -46,9 +46,9 @@ KERNEL(pa_kv_cache_update)(
                             block_offset * HEAD_SIZE_BLOCKING +
                             head_size_outer_block * KV_CACHE_BLOCK_SIZE * HEAD_SIZE_BLOCKING +
                             head_size_inner_block;
-    // if (INPUT0_FEATURE_NUM == 18 && INPUT0_BATCH_NUM == 2) {
-    //     printf("%d. %d - key\n", out_offset, in_offset);
+    // if (batch_idx == 0) {
+    //     printf("Update key_cache %d. %d (%f)\n", out_offset, in_offset, key_data[in_offset]);
     // }
-    value_cache_data[out_offset] = key_data[in_offset];
+    key_cache_data[out_offset] = key_data[in_offset];
 #endif
 }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_ref.cl
@@ -14,6 +14,7 @@
 // constexpr size_t HEAD_SIZE = 64;
 // constexpr size_t HEADS_NUM = 32;
 // constexpr size_t KV_HEADS_NUM = 4;
+// constexpr size_t NUM_QUERIES_PER_KV_HEAD = HEADS_NUM / KV_HEADS_NUM;
 // constexpr size_t BLOCK_SIZE = 16;
 // constexpr size_t X_SIZE = 4;
 
@@ -29,14 +30,14 @@
 // How much QK outputs each subgroup calculates per cycle
 #define QK_PER_SG 4
 
-#define KV_CACHE_BLOCK_STRIDE (HEAD_SIZE * HEADS_NUM * BLOCK_SIZE)
+#define KV_CACHE_BLOCK_STRIDE (HEAD_SIZE * KV_HEADS_NUM * BLOCK_SIZE)
 
 #define QUERY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset)
 
 #define SUBGROUPS_PER_WG HEAD_SIZE / SUB_GROUP_SIZE
 
 REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
-__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, 64)))
 KERNEL(pa_sdpa_ref)(
     OPTIONAL_SHAPE_INFO_ARG
     __global const INPUT0_TYPE* query,
@@ -45,6 +46,7 @@ KERNEL(pa_sdpa_ref)(
     __global const INPUT3_TYPE* max_context_len,
     __global const INPUT4_TYPE* context_lens,
     __global const INPUT5_TYPE* block_tables,
+    __global const INPUT6_TYPE* scale,
     __global OUTPUT_TYPE* output)
 {
     const uint seq_idx = get_global_id(0);
@@ -60,6 +62,30 @@ KERNEL(pa_sdpa_ref)(
 
     const uint blocks_num = INPUT5_FEATURE_NUM;
 
+    // if (seq_idx < 2 && head_num_idx < 2 && sgid < 2 && sglid < 2) {
+    //     if (INPUT5_FEATURE_NUM == 0) {
+    //         printf("Empty blocks. Seq_idx=%d, head_num_idx=%d, head_idx=%d, sglid=%d, sgid=%d, batch_idx=%d, token_idx=%d, context_len=%d, scale=%f\n",
+    //         seq_idx, head_num_idx, head_idx, sglid, sgid, batch_idx, token_idx, context_len, scale[0]);
+    //     } else if (INPUT5_FEATURE_NUM == 1) {
+    //         printf("Blocks table[b=0]: %d. Seq_idx=%d, head_num_idx=%d, head_idx=%d, sglid=%d, sgid=%d, batch_idx=%d, token_idx=%d, context_len=%d, scale=%f\n", block_tables[0],
+    //         seq_idx, head_num_idx, head_idx, sglid, sgid, batch_idx, token_idx, context_len, scale[0]);
+    //     } else if (INPUT5_FEATURE_NUM == 2) {
+    //         printf("Blocks table[b=0]: %d %d. Seq_idx=%d, head_num_idx=%d, head_idx=%d, sglid=%d, sgid=%d, batch_idx=%d, token_idx=%d, context_len=%d, scale=%f\n", block_tables[0], block_tables[1],
+    //         seq_idx, head_num_idx, head_idx, sglid, sgid, batch_idx, token_idx, context_len, scale[0]);
+    //     } else if (INPUT5_FEATURE_NUM == 3) {
+    //         printf("Blocks table[b=0]: %d %d %d. Seq_idx=%d, head_num_idx=%d, head_idx=%d, sglid=%d, sgid=%d, batch_idx=%d, token_idx=%d, context_len=%d, scale=%f\n", block_tables[0], block_tables[1], block_tables[2],
+    //         seq_idx, head_num_idx, head_idx, sglid, sgid, batch_idx, token_idx, context_len, scale[0]);
+    //     } else if (INPUT5_FEATURE_NUM == 4) {
+    //         printf("Blocks table[b=0]: %d %d %d %d. Seq_idx=%d, head_num_idx=%d, head_idx=%d, sglid=%d, sgid=%d, batch_idx=%d, token_idx=%d, context_len=%d, scale=%f\n", block_tables[0], block_tables[1], block_tables[2], block_tables[3],
+    //         seq_idx, head_num_idx, head_idx, sglid, sgid, batch_idx, token_idx, context_len, scale[0]);
+    //     }
+
+    //     if (seq_idx == 0 && head_num_idx == 0 && sgid == 0 && sglid == 0) {
+    //         printf("key_cache[405504]=%f\n", key_cache[405504]);
+    //         printf("value_cache[405504]=%f\n", value_cache[405504]);
+    //     }
+    // }
+
     // sgid0: 0..3
     // sgid1: 4..7
     // sgid2: 8..11
@@ -84,7 +110,9 @@ KERNEL(pa_sdpa_ref)(
         OUTPUT_TYPE qk[QK_PER_SG] = {0};
 
         for (uint hs = 0; hs < HEAD_ITEMS_PER_WI; hs++) {
-            const uint query_idx = seq_idx * HEAD_SIZE * HEADS_NUM + hs * SUB_GROUP_SIZE;
+            const uint query_idx = seq_idx * HEAD_SIZE * HEADS_NUM +
+                                   head_num_idx * HEAD_SIZE +
+                                   hs * SUB_GROUP_SIZE;
 
             // TODO: can be preloaded outside HEAD_ITEMS_PER_WI loop - need to check perf
             INPUT0_TYPE q = QUERY_BLOCK_READ(query, query_idx);
@@ -94,34 +122,53 @@ KERNEL(pa_sdpa_ref)(
                     continue;
 
                 const uint key_idx = block_offset +
+                                     (head_num_idx / NUM_QUERIES_PER_KV_HEAD) * (HEAD_SIZE / X_SIZE * BLOCK_SIZE * X_SIZE) +
                                      (X_SIZE * QK_PER_SG) * sgid +
                                      (HEAD_ITEMS_PER_WI * BLOCK_SIZE * X_SIZE) * hs +
                                      (sglid / X_SIZE) * X_SIZE * BLOCK_SIZE +
                                      (sglid % X_SIZE) + qk_idx * X_SIZE;
+
                 // TODO1: try block loading and shuffling
                 // TODO2: try to load k*4 times and then calculate
                 // TODO3: try bigger X block
                 INPUT1_TYPE k = key_cache[key_idx];
 
+
+                // if (seq_idx == 0 && head_num_idx == 0) {
+                //     printf("main_calc: seq_idx=%d, head_num_idx=%d, sgid=%d, sglid=%d, block=%d, hs=%d, qk_idx=%d, current_token=%d, query_idx=%d, key_idx=%d (block_offset=%d): %f * %f\n",
+                //         seq_idx, head_num_idx, sgid, sglid, block, hs, qk_idx, current_token, query_idx, key_idx - block_offset, block_offset, q, k);
+                // }
+
                 qk[qk_idx] = mad(q, k, qk[qk_idx]);
             }
         }
 
-        // Summurize qk calculation across all WIs
+        // Summarize qk calculation across all WIs and apply scale
         for (uint qk_idx = 0; qk_idx < QK_PER_SG; qk_idx++) {
-            qk[QK_PER_SG] = sub_group_reduce_add(qk[QK_PER_SG]);
-            qk_max = OUTPUT_MAX_FUNC(qk_max, qk[QK_PER_SG]);
+            const uint current_token = block * BLOCK_SIZE + sgid * QK_PER_SG + qk_idx;
+            if (current_token < context_len) {
+                OUTPUT_TYPE tmp_print = qk[qk_idx];
+                qk[qk_idx] = sub_group_reduce_add(qk[qk_idx]);
+                // if (head_num_idx < 4)
+                //     printf("final_calc: seq_idx=%d, head_num_idx=%d, sgid=%d, sglid=%d: before qk[%d]=%f, after=%f\n",
+                //             seq_idx, head_num_idx, sgid, sglid, qk_idx, tmp_print, qk[qk_idx]);
+                qk[qk_idx] = scale[0] * qk[qk_idx];
+                qk_max = OUTPUT_MAX_FUNC(qk_max, qk[qk_idx]);
+            }
         }
 
         // Save QK results to local memory
         if (sglid < QK_PER_SG) {
-            const uint qk_local_idx = block * BLOCK_SIZE * sgid * QK_PER_SG + sglid;
-            qk_vals[qk_local_idx] = qk[sglid];
+            const uint current_token = block * BLOCK_SIZE + sgid * QK_PER_SG + sglid;
+            // NOTE: replaces the previous index `block * BLOCK_SIZE * sgid * QK_PER_SG + sglid`; qk_vals is now indexed by current_token
+            // OUTPUT_TYPE tmp_print = (current_token >= context_len ? 0 : qk[sglid]);
+            // if (head_num_idx < 4 || head_num_idx == 31)
+            //     printf("slm save: seq_idx=%d, head_num_idx=%d, sgid=%d, sglid=%d: qk_vals[%d]=%f. Max=%f\n",
+            //             seq_idx, head_num_idx, sgid, sglid, current_token, tmp_print, qk_max);
+            qk_vals[current_token] = current_token >= context_len ? 0 : qk[sglid];
         }
     }
 
-    /* WARNING NEED TO ADD BIAS BEFORE SOFTMAX */
-
     // Apply SoftMax operation
     __local OUTPUT_TYPE qk_max_vals[SUBGROUPS_PER_WG];
     __local OUTPUT_TYPE qk_sum_vals[SUBGROUPS_PER_WG];
@@ -138,23 +185,35 @@ KERNEL(pa_sdpa_ref)(
         // Final max value after reduction across of all SG and WI
         qk_max = sub_group_reduce_max(qk_max);
 
+        // if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) {
+        //     printf("QK max value = %f\n", qk_max);
+        // }
+
         OUTPUT_TYPE exp_sum = OUTPUT_VAL_ZERO;
         for (uint qk_idx = 0; qk_idx < CEIL_DIV(context_len, SUBGROUPS_PER_WG * SUB_GROUP_SIZE); qk_idx++) {
             const uint data_idx = qk_idx * (SUBGROUPS_PER_WG * SUB_GROUP_SIZE) + sgid * SUB_GROUP_SIZE + sglid;
             if (data_idx < context_len) {
                 OUTPUT_TYPE val = native_exp(qk_vals[data_idx] - qk_max);
                 exp_sum += val;
                 qk_vals[data_idx] = val;
+                // if (head_num_idx < 4 || head_num_idx == 31)
+                //     printf("head_num %d, sgid = %d, sglid = %d, exp_sum = %f\n", head_num_idx, sgid, sglid, exp_sum);
             }
         }
 
         exp_sum = sub_group_reduce_add(exp_sum);
 
+        // if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) {
+        //     printf("exp_sum final value = %f\n", exp_sum);
+        // }
+
         if (sglid == 0)
             qk_sum_vals[sgid] = exp_sum;
 
         barrier(CLK_LOCAL_MEM_FENCE);
 
+        exp_sum = OUTPUT_VAL_ZERO;
+
         if (sglid < SUBGROUPS_PER_WG)
             exp_sum = qk_sum_vals[sglid];
 
@@ -163,6 +222,8 @@ KERNEL(pa_sdpa_ref)(
 
         const OUTPUT_TYPE inv_sum = OUTPUT_VAL_ONE / exp_sum;
 
+
+        // TODO: replace CEIL_DIV with ALIGN and use += SUBGROUPS_PER_WG * SUB_GROUP_SIZE increment
         for (uint qk_idx = 0; qk_idx < CEIL_DIV(context_len, SUBGROUPS_PER_WG * SUB_GROUP_SIZE); qk_idx++) {
             const uint data_idx = qk_idx * (SUBGROUPS_PER_WG * SUB_GROUP_SIZE) + sgid * SUB_GROUP_SIZE + sglid;
             if (data_idx < context_len) {
@@ -174,5 +235,61 @@ KERNEL(pa_sdpa_ref)(
         barrier(CLK_LOCAL_MEM_FENCE);
     }
 
-    output[seq_idx + sglid] = qk_vals[sglid % context_len];
+    // if (seq_idx == 0 && sgid == 0 && sglid == 0) {
+    //     for (uint i = 0; i < context_len; i++) {
+    //         printf("Softmax res for %d head: %d. %f\n", head_num_idx, i, qk_vals[i]);
+    //     }
+    // }
+
+    {
+        OUTPUT_TYPE acc = OUTPUT_VAL_ZERO;
+
+        for (uint qk_idx = 0; qk_idx < ALIGN(context_len, SUB_GROUP_SIZE); qk_idx += SUB_GROUP_SIZE) {
+            const uint qk_offset = qk_idx + sglid;
+
+            OUTPUT_TYPE qk = qk_offset < context_len ? qk_vals[qk_offset] : OUTPUT_VAL_ZERO;
+
+            const uint block_idx = block_tables[batch_idx * blocks_num + (qk_idx / SUB_GROUP_SIZE)];
+            if (block_idx == 0)
+                continue;
+
+            const uint value_cache_offset = block_idx * KV_CACHE_BLOCK_STRIDE +
+                                            (head_num_idx / NUM_QUERIES_PER_KV_HEAD) * (HEAD_SIZE * BLOCK_SIZE) +
+                                            sgid * (SUB_GROUP_SIZE * BLOCK_SIZE) +
+                                            sglid * BLOCK_SIZE;
+
+            #define VALUE_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE)
+            #define VALUE_VLOAD(offset, ptr) CAT(vload, BLOCK_SIZE)(offset, ptr)
+
+            ushort16 v_tmp = vload16(0, (__global ushort*)(value_cache + value_cache_offset));
+            OUTPUT_TYPE* v = (OUTPUT_TYPE*)&v_tmp;
+
+            // VALUE_VEC_TYPE* tmp_print = v;
+
+            // if (seq_idx == 0 && head_num_idx == 0) {
+            //     printf("gemm2: seq_idx=%d, head_num_idx=%d, sgid=%d, sglid=%d, block_idx=%d, qk_idx=%d, qk_offset=%d, value_offset=%d (block_offset=%d): %v8f\n",
+            //         seq_idx, head_num_idx, sgid, sglid, block_idx, qk_idx, qk_offset, value_cache_offset - (block_idx * KV_CACHE_BLOCK_STRIDE), block_idx * KV_CACHE_BLOCK_STRIDE, *tmp_print);
+            // }
+
+            for (uint token = 0; token < BLOCK_SIZE; token++) {
+                OUTPUT_TYPE qk_tmp = sub_group_broadcast(qk, token);
+                if (qk_idx + token < context_len) {
+                    acc = mad(qk_tmp, v[token], acc);
+                }
+            }
+        }
+
+
+        const uint output_offset = seq_idx * (HEADS_NUM * HEAD_SIZE) +
+                                   head_num_idx * HEAD_SIZE +
+                                   sgid * SUB_GROUP_SIZE +
+                                   sglid;
+
+        // if (seq_idx == 0 && head_num_idx < 2 || head_num_idx == 31) {
+        //     printf("output res: seq_idx=%d, head_num_idx=%d, sgid=%d, sglid=%d: output[%d] = %f\n",
+        //         seq_idx, head_num_idx, sgid, sglid, output_offset, acc);
+        // }
+
+        output[output_offset] = acc;
+    }
 }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/paged_attention/kv_cache_update_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/paged_attention/kv_cache_update_kernel_ref.cpp
@@ -117,6 +117,8 @@ bool KVCacheUpdateKernelRef::Validate(const Params& params) const {
 JitConstants KVCacheUpdateKernelRef::GetJitConstants(const kv_cache_update_params& kernel_params, KernelMode mode) const {
     JitConstants jit = MakeBaseParamsJitConstants(kernel_params);
 
+    GPU_DEBUG_TRACE << "Configure kernel for " << static_cast<int>(mode) << "\n";
+
     if (mode == KernelMode::key_cache_update)
         jit.AddConstant(MakeJitConstant("KEY_CACHE_UPDATE", 1));
     else
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/paged_attention/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/paged_attention/sdpa_kernel_ref.cpp
@@ -119,6 +119,7 @@ JitConstants SDPAKernelRef::GetJitConstants(const sdpa_params& kernel_params) co
     jit.AddConstant(MakeJitConstant("HEAD_SIZE", HEAD_SIZE));
     jit.AddConstant(MakeJitConstant("HEADS_NUM", HEADS_NUM));
     jit.AddConstant(MakeJitConstant("KV_HEADS_NUM", KV_HEADS_NUM));
+    jit.AddConstant(MakeJitConstant("NUM_QUERIES_PER_KV_HEAD", HEADS_NUM / KV_HEADS_NUM));
     jit.AddConstant(MakeJitConstant("BLOCK_SIZE", BLOCK_SIZE));
     jit.AddConstant(MakeJitConstant("X_SIZE", X_SIZE));
 
@@ -140,7 +141,7 @@ CommonDispatchData SDPAKernelRef::SetDefault(const sdpa_params& kernel_params) {
         dispatch_data.gws = { tokens_num,
                               kernel_params.configuration.heads_num,
                               kernel_params.configuration.head_size };
-        dispatch_data.lws = { 1, 1, 16 };
+        dispatch_data.lws = { 1, 1, kernel_params.configuration.head_size };
     }
 
     return dispatch_data;