
Commit 84de26d

p-durandin authored and vshampor committed
[GPU] PA, rotation minor fixes
1 parent c8ab4eb commit 84de26d


3 files changed: +28 -41 lines changed


src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp

+22
@@ -214,6 +214,12 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
             if (desc->has_alibi) {
                 args.inputs.push_back(instance.alibi_memory_ptr());
             }
+
+            if (desc->has_rotated_blocks) {
+                args.inputs.push_back(instance.rotated_block_indices_memory_ptr());
+                args.inputs.push_back(instance.rotation_deltas_memory_ptr());
+                args.inputs.push_back(instance.rotation_trig_lut_memory_ptr());
+            }
         } else if (kernel_idx == 2 || kernel_idx == 3) {
             // Finalization kernel or mixed stage finalization kernel
             args.inputs = { instance.past_lens_memory_ptr() };
@@ -681,6 +687,10 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
         if (has_alibi)
             inputs_number++;
 
+        const auto has_rotation = impl_param.input_layouts.size() == 16;
+        if (has_rotation)
+            inputs_number += 3;
+
         auto input_idx = 0;
         params.inputs.resize(inputs_number);
         params.inputs[input_idx++] = query_tensor;
@@ -699,6 +709,12 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
         if (has_alibi)
             params.inputs[input_idx++] = alibi_tensor;
 
+        if (has_rotation) {
+            params.inputs[input_idx++] = input_tensors[13];
+            params.inputs[input_idx++] = input_tensors[14];
+            params.inputs[input_idx++] = input_tensors[15];
+        }
+
         if (has_scores_output) {
             params.outputs.resize(2);
             params.outputs[1] = convert_data_tensor(impl_param.get_output_layout(1));
@@ -736,6 +752,12 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
         if (has_alibi)
             in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(11)});
 
+        if (has_rotation) {
+            in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(13)});
+            in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(14)});
+            in_tensor_to_offset_map.insert({input_idx++, in_offsets_map.at(15)});
+        }
+
         if (has_scores_output)
             out_tensor_to_offset_map.insert({1, out_offsets_map.at(1)});

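Note: the C++ changes above wire three extra inputs through the primitive when cache rotation is active. The runtime arguments gain the rotated-block-indices, rotation-deltas, and rotation-trig-LUT memory pointers, and the kernel parameters pick up the corresponding tensors at offsets 13, 14, and 15. The sketch below restates that mapping in plain C++; only the 16-input total and the 13/14/15 offsets come from the diff, and the helper names are illustrative, not the plugin's API.

// Minimal sketch (not the plugin code): how this patch detects the rotation
// case and which input slots the rotation tensors occupy.
#include <cstddef>

// Offsets of the rotation tensors within PagedAttention's input list (from the diff).
constexpr std::size_t kRotatedBlockIndicesIdx = 13;
constexpr std::size_t kRotationDeltasIdx      = 14;
constexpr std::size_t kRotationTrigLutIdx     = 15;

// Rotation is inferred purely from the input count: the primitive receives
// 16 input layouts when the three rotation tensors are appended.
inline bool has_rotation(std::size_t num_input_layouts) {
    return num_input_layouts == 16;
}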
src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl

+4 -3
@@ -43,10 +43,11 @@ KERNEL(pa_sdpa_opt)(
 #if HAS_ALIBI
     const __global ALIBI_INPUT_TYPE* alibi_slopes,
 #endif
+
 #if HAS_ROTATED_BLOCKS
-    const __global INPUT8_TYPE* rotated_block_indices,
-    const __global INPUT9_TYPE* rotation_deltas,
-    const __global INPUT10_TYPE* rotation_trig_lut,
+    const __global INPUT7_TYPE* rotated_block_indices,
+    const __global INPUT8_TYPE* rotation_deltas,
+    const __global INPUT9_TYPE* rotation_trig_lut,
 #endif
     __global OUTPUT_TYPE* output,
 #if PAGED_ATTENTION_SCORES_OUTPUT

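Note: in pa_sdpa_opt.cl the rotation arguments were declared with the INPUT8/INPUT9/INPUT10 type macros, while the renumbering to INPUT7/INPUT8/INPUT9 aligns them with the positions these tensors actually occupy in the kernel's input list. A rough C++ illustration of that correspondence follows; only indices 7-9 are grounded in the diff, the struct and names are hypothetical.

// Hypothetical sketch of the index-to-argument correspondence implied by the
// renumbering above (the rest of the kernel's input list is not shown).
#include <cstddef>
#include <string_view>

struct RotationArg {
    std::size_t input_idx;   // index n selecting the INPUTn_TYPE macro
    std::string_view name;   // kernel argument it describes
};

constexpr RotationArg kRotationArgs[] = {
    {7, "rotated_block_indices"},
    {8, "rotation_deltas"},
    {9, "rotation_trig_lut"},
};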
src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl

+2 -38
@@ -1004,6 +1004,7 @@ KERNEL(sdpa_opt)(
     const uint partition_seq_len = min((uint)SOURCE_SEQ_LEN - start_partition_idx, (uint)SEQ_LEN_PARTITION_SIZE);
 #endif
 
+    MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_acc = INPUT0_VAL_ZERO;
 #if IS_CAUSAL
     if (seq_len <= target_seq_idx) { // keep tril i.e. m >= n
 #endif
@@ -1037,11 +1038,7 @@
 #endif
 
     int seq_len_calc_size = min((int)(SOURCE_SEQ_LEN) - (int)seq_len, (int)SUBGROUP_SIZE);
-#if IS_CAUSAL
-    MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_acc = INPUT0_VAL_ZERO;
-#else // !IS_CAUSAL
-    MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_acc;
-
+#if !IS_CAUSAL
     qk_acc = FUNC_CALL(load_attn_mask)(OPTIONAL_SHAPE_INFO_TENSOR
         b0_idx,
         b1_idx,
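Note: the two hunks above hoist the qk_acc declaration out of the causal/non-causal branches. The accumulator is now declared once and zero-initialized before the IS_CAUSAL block, and the non-causal path simply overwrites it with the attention-mask values. A minimal C++ analogue of that pattern, with stand-in names for the kernel macros and helpers (illustrative only):

#include <array>

constexpr bool kIsCausal  = false;  // stand-in for IS_CAUSAL
constexpr int  kBlockSize = 16;     // stand-in for TARGET_SEQ_LEN_BLOCK_SIZE

// Stand-in for FUNC_CALL(load_attn_mask)(...); the real kernel reads the mask here.
std::array<float, kBlockSize> load_attn_mask() {
    return {};
}

std::array<float, kBlockSize> init_qk_acc() {
    std::array<float, kBlockSize> qk_acc{};  // always zero-initialized up front
    if constexpr (!kIsCausal) {
        qk_acc = load_attn_mask();           // non-causal path starts from the mask
    }
    return qk_acc;
}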
@@ -1279,39 +1276,6 @@
         }
     }
 
-#if PAGED_ATTENTION_SCORES_OUTPUT
-    const uint subsequence_idx = gws_seq_indexes_correspondence[target_seq_dim];
-    const uint subsequence_end_pos = subsequence_begins[subsequence_idx + 1];
-    const uint block_start_pos = blocked_indexes_start[target_seq_dim];
-    const uint block_end_pos = blocked_indexes_end[target_seq_dim];
-
-    // PagedAttention is supposed to save only last "row" of the QK matrix multiplication,
-    // so save SEQ_LEN_PARTITION_SIZE elements for each partition
-    if (subsequence_end_pos == block_end_pos) {
-        const uint last_row_idx = block_end_pos - block_start_pos - 1;
-        if (sglid == last_row_idx) {
-            const uint partition_idx = start_partition_idx / SEQ_LEN_PARTITION_SIZE;
-
-            if (sgid == 0) {
-                const uint max_partitions_num = aligned_max_context_len / SEQ_LEN_PARTITION_SIZE;
-                const uint exp_sums_output_offset = subsequence_idx * NUM_HEADS * max_partitions_num +
-                                                    num_heads_dim * max_partitions_num +
-                                                    partition_idx;
-                exp_sums[exp_sums_output_offset] = exp_sum_new;
-                max_logits[exp_sums_output_offset] = qk_max_new;
-            }
-
-            const uint output_offset = subsequence_idx * NUM_HEADS * aligned_max_context_len +
-                                       num_heads_dim * aligned_max_context_len +
-                                       partition_idx * SEQ_LEN_PARTITION_SIZE + sgid * TARGET_SEQ_LEN_BLOCK_SIZE;
-            for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) {
-                softmax_results[output_offset + i] = qk_acc[i];
-            }
-
-        }
-    }
-#endif
-
     barrier(CLK_LOCAL_MEM_FENCE);
 }

0 commit comments

Comments
 (0)