Skip to content

Commit aa6ff61

Browse files
committed
Fix kernel arguments and adjust min seq_len
1 parent 38107ba commit aa6ff61

File tree

2 files changed

+6
-6
lines changed

2 files changed

+6
-6
lines changed

src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
238238

239239
args.outputs = { instance.output_memory_ptr(0) };
240240
} else if (stage == Stage::PA_SDPA) {
241-
if (kernel_idx == 0 || kernel_idx == 1) {
241+
if (kernel_idx == 0 || kernel_idx == 1 || kernel_idx == 2) {
242242
// 2nd+ token calculation or mixed stage tokens calculation
243243
args.shape_info = instance.shape_info_memory_ptr();
244244

@@ -262,7 +262,7 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
262262
if (desc->has_alibi) {
263263
args.inputs.push_back(instance.alibi_memory_ptr());
264264
}
265-
} else if (kernel_idx == 2 || kernel_idx == 3) {
265+
} else if (kernel_idx == 3 || kernel_idx == 4) {
266266
// Finalization kernel or mixed stage finalization kernel
267267
args.inputs = { instance.past_lens_memory_ptr() };
268268

@@ -276,15 +276,15 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
276276
args.inputs.push_back(instance.rotation_deltas_memory_ptr());
277277
args.inputs.push_back(instance.rotation_trig_lut_memory_ptr());
278278
}
279-
} else if (kernel_idx == 4) {
279+
} else if (kernel_idx == 5) {
280280
// Output scores calculation kernel
281281
args.inputs = { instance.past_lens_memory_ptr(),
282282
instance.subsequence_begins_memory_ptr() };
283283
}
284284

285285
args.outputs = { instance.output_memory_ptr(0) };
286286

287-
if (kernel_idx == 4) {
287+
if (kernel_idx == 5) {
288288
args.outputs.push_back(instance.output_memory_ptr(1));
289289
}
290290
}

src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -369,8 +369,8 @@ void PagedAttentionSDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) cons
369369
const auto scores_calc_only = prim_params.stage == PagedAttentionStage::PREFILL && has_scores_output;
370370
const auto multi_tokens_mode = prim_params.stage == PagedAttentionStage::MIXED;
371371

372-
// Apply GQA optimization starting from a certain sequence length value
373-
const auto min_gqa_sequence_len = 8 * seq_len_partition_size;
372+
// Apply GQA optimization starting from a certain sequence length value (4K tokens)
373+
const auto min_gqa_sequence_len = 16 * seq_len_partition_size;
374374
// Apply GQA only if there is a single subsequence in the request,
375375
// as multiple subsequences might have significantly different lengths
376376
const auto max_subsequences_num = 1;

0 commit comments

Comments
 (0)