Skip to content

Commit 8a5f0ea

Browse files
committed
[GPU] FP32 acc for 2nd+ token PagedAttention
1 parent cfbc998 commit 8a5f0ea

File tree

4 files changed: +47 −9 lines changed

src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp

+13-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,19 @@ struct crop_impl : typed_primitive_impl_ocl<crop> {
5555
}
5656

5757
update_shapes(*_kernel_data.params, impl_param);
58-
auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset();
58+
59+
// Reset input_layout padding as the offset configured by crop should affect only "data"
60+
// area and shouldn't depend on input_layout paddings.
61+
// For example, for an input shape like: [1, 32, 128 (pad_before=512, pad_after=0), 8]
62+
// with crop_axis=2 and split_lengths = {64, 64},
63+
// runtime_offset should be set in terms of [1, 32, 128, 8] shape, as the kernel reads data
64+
// using "input[GET_INDEX(INPUT, order) + runtime_offset]", where GET_INDEX already reflects input
65+
// data paddings.
66+
// So crop.out0's runtime_offset=0 and crop.out1's runtime_offset=512.
67+
auto input_layout = impl_param.get_input_layout();
68+
input_layout.data_padding = padding();
69+
70+
auto runtime_offset = convert_data_tensor(input_layout, impl_param.input_offsets[0]).GetFirstElementOffset();
5971
kernel_selector::ScalarDescriptor s;
6072
s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
6173
s.v.u32 = static_cast<uint32_t>(runtime_offset);

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl

+7-7
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ KERNEL(pa_sdpa_opt)(
107107
#endif
108108

109109
// SLM for intermediate QK results
110-
__local OUTPUT_TYPE slm_qk_vals[SEQ_LEN_PARTITION_SIZE];
110+
__local SOFTMAX_ACCUMULATOR_TYPE slm_qk_vals[SEQ_LEN_PARTITION_SIZE];
111111

112112
// SLM buffers for SoftMax calculation and qk_max/qk_sums results aggregation across all WGs
113113
__local SOFTMAX_ACCUMULATOR_TYPE slm_qk_max_vals[SUBGROUPS_PER_WG];
@@ -166,7 +166,7 @@ KERNEL(pa_sdpa_opt)(
166166
#endif
167167
const uint block_offset = block_indices[start_block_idx + block_num * SUBGROUPS_PER_WG] * HEAD_SIZE * KV_HEADS_NUM * SUBGROUP_SIZE + head_idx * HEAD_SIZE * SUBGROUP_SIZE;
168168

169-
INPUT0_TYPE qk_acc = INPUT0_VAL_ZERO;
169+
SOFTMAX_ACCUMULATOR_TYPE qk_acc = SOFTMAX_ACCUMULATOR_VAL_ZERO;
170170

171171
#define KEY_VEC_SIZE SUBGROUP_SIZE
172172
unroll_for (uint qk_idx = 0; qk_idx < HEAD_SIZE / KEY_VEC_SIZE; qk_idx++) {
@@ -181,9 +181,9 @@ KERNEL(pa_sdpa_opt)(
181181

182182
unroll_for (uint i = 0; i < KEY_VEC_SIZE; i++) {
183183
#if STORE_QUERY_TO_SLM
184-
qk_acc = mad(sub_group_broadcast(q_val, i), k_vals[i], qk_acc);
184+
qk_acc = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(q_val, i)), TO_SOFTMAX_ACCUMULATOR_TYPE(k_vals[i]), qk_acc);
185185
#else
186-
qk_acc = mad(sub_group_broadcast(q_val[qk_idx], i), k_vals[i], qk_acc);
186+
qk_acc = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(q_val[qk_idx], i)), TO_SOFTMAX_ACCUMULATOR_TYPE(k_vals[i]), qk_acc);
187187
#endif
188188
}
189189
}
@@ -196,7 +196,7 @@ KERNEL(pa_sdpa_opt)(
196196
#endif
197197

198198
if (token_idx >= seq_len)
199-
qk_acc = INPUT0_VAL_MIN;
199+
qk_acc = SOFTMAX_ACCUMULATOR_VAL_MIN;
200200

201201
qk_max = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max, TO_SOFTMAX_ACCUMULATOR_TYPE(qk_acc));
202202

@@ -235,7 +235,7 @@ KERNEL(pa_sdpa_opt)(
235235
if (global_data_idx < seq_len && local_data_idx < SEQ_LEN_PARTITION_SIZE) {
236236
#endif
237237
SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(slm_qk_vals[local_data_idx]) - qk_max);
238-
slm_qk_vals[local_data_idx] = TO_OUTPUT_TYPE(qk_new);
238+
slm_qk_vals[local_data_idx] = qk_new;
239239

240240
exp_sum += qk_new;
241241
}
@@ -266,7 +266,7 @@ KERNEL(pa_sdpa_opt)(
266266
if (global_data_idx < seq_len && local_data_idx < SEQ_LEN_PARTITION_SIZE) {
267267
#endif
268268
SOFTMAX_ACCUMULATOR_TYPE qk_new = TO_SOFTMAX_ACCUMULATOR_TYPE(slm_qk_vals[local_data_idx]) / exp_sum;
269-
slm_qk_vals[local_data_idx] = TO_OUTPUT_TYPE(qk_new);
269+
slm_qk_vals[local_data_idx] = qk_new;
270270
}
271271
}
272272

src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,20 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
799799
auto& engine = m_graph->get_engine();
800800
auto& stream = network->get_stream();
801801

802+
if (internal_name == "parameter:input_ids") {
803+
auto data = user_tensor->data<int64_t>();
804+
805+
auto print_arr = [&](int64_t* vec, size_t max_len, std::string name) {
806+
std::stringstream ss;
807+
for (size_t i = 0; i < max_len; i++) {
808+
ss << vec[i] << ", ";
809+
}
810+
std::cout << "Array " << name << " (len=" << max_len << ") content: " << ss.str() << "\n";
811+
};
812+
813+
print_arr(data, user_tensor->get_size(), "input_ids");
814+
}
815+
802816
auto need_lockable_mem = network->does_node_need_lockable_output(internal_name);
803817

804818
OPENVINO_ASSERT(pshape.compatible(ov::PartialShape(user_tensor->get_shape())) || is_batched_input(port),

src/plugins/intel_gpu/src/runtime/execution_config.cpp

+13-1
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,25 @@ class PerformanceModeValidator : public BaseValidator {
3434
};
3535

3636
void ExecutionConfig::set_default() {
37+
auto default_inference_precision_hint = ov::element::f16;
38+
int USE_FP32 = 0;
39+
if (const auto env_var = std::getenv("USE_FP32")) {
40+
std::istringstream ss(env_var);
41+
ss >> USE_FP32;
42+
}
43+
44+
if (USE_FP32) {
45+
default_inference_precision_hint = ov::element::f32;
46+
std::cout << "inference_precision forced to f32\n";
47+
}
48+
3749
register_property<PropertyVisibility::PUBLIC>(
3850
std::make_tuple(ov::device::id, "0"),
3951
std::make_tuple(ov::enable_profiling, false),
4052
std::make_tuple(ov::cache_dir, ""),
4153
std::make_tuple(ov::num_streams, 1),
4254
std::make_tuple(ov::compilation_num_threads, std::max(1, static_cast<int>(std::thread::hardware_concurrency()))),
43-
std::make_tuple(ov::hint::inference_precision, ov::element::f16, InferencePrecisionValidator()),
55+
std::make_tuple(ov::hint::inference_precision, default_inference_precision_hint, InferencePrecisionValidator()),
4456
std::make_tuple(ov::hint::model_priority, ov::hint::Priority::MEDIUM),
4557
std::make_tuple(ov::hint::performance_mode, ov::hint::PerformanceMode::LATENCY, PerformanceModeValidator()),
4658
std::make_tuple(ov::hint::execution_mode, ov::hint::ExecutionMode::PERFORMANCE),

0 commit comments

Comments (0)