Skip to content

Commit 02a34f6

Browse files
committed
WA (workaround): Add explicit PagedAttention (PA) configuration via environment variables
1 parent 59e4d7e commit 02a34f6

File tree

2 files changed

+60
-0
lines changed

2 files changed

+60
-0
lines changed

src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp

+27
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,33 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
163163
config.x_block_size = desc->x_block_size;
164164
config.max_context_len = 1;
165165

166+
if (!impl_param.is_dynamic()) {
167+
auto query_shape = impl_param.get_input_layout(0).get_shape();
168+
auto key_cache_shape = impl_param.get_input_layout(3).get_shape();
169+
auto value_cache_shape = impl_param.get_input_layout(4).get_shape();
170+
171+
auto actual_head_size = value_cache_shape[2];
172+
auto actual_heads_num = query_shape[2] / actual_head_size;
173+
auto actual_kv_heads_num = value_cache_shape[1];
174+
auto actual_block_size = value_cache_shape[3];
175+
auto actual_x_block_size = key_cache_shape[4];
176+
177+
bool valid_params = config.head_size == actual_head_size &&
178+
config.heads_num == actual_heads_num &&
179+
config.kv_heads_num == actual_kv_heads_num &&
180+
config.block_size == actual_block_size &&
181+
config.x_block_size == actual_x_block_size;
182+
183+
OPENVINO_ASSERT(valid_params, "[GPU] Got unexpected parameters for PA operation. ",
184+
"Currently they need to be specified explicitly (this should be fixed soon by PA model conversion improvement). ",
185+
"Please use the following environment variables for proper PA configuration: ",
186+
"PA_HEAD_SIZE=", actual_head_size, " ",
187+
"PA_HEADS_NUM=", actual_heads_num, " ",
188+
"PA_KV_HEADS_NUM=", actual_kv_heads_num, " ",
189+
"PA_BLOCK_SIZE=", actual_block_size, " ",
190+
"PA_X_BLOCK_SIZE=", actual_x_block_size);
191+
}
192+
166193
const size_t simd_size = 16;
167194
OPENVINO_ASSERT(config.head_size % simd_size == 0, "[GPU] Head size is expected to be divisible by 16");
168195

src/plugins/intel_gpu/src/plugin/ops/custom.cpp

+33
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,19 @@ class CustomLayerAttributeVisitor : public ov::AttributeVisitor {
101101
std::map<std::string, std::string> m_values;
102102
};
103103

104+
/// Converts the textual representation `str` into a value of type T using
/// formatted stream extraction (`operator>>`).
///
/// The result is value-initialized before extraction, so when extraction
/// cannot even start (e.g. `str` is empty or contains only whitespace) the
/// function returns `T{}` instead of reading an indeterminate value.
/// Leading whitespace is skipped by the stream and any characters left after
/// the parsed value are ignored. No error is reported to the caller.
///
/// @tparam T   Default-constructible type supporting `std::istream >> T`.
/// @param str  Text to parse.
/// @return     The parsed value, or `T{}` if nothing could be extracted.
template <typename T>
T convert_to(const std::string &str) {
    std::istringstream ss(str);
    T res{};  // value-init: a failed extraction must not yield garbage
    ss >> res;
    return res;
}

/// Specialization for `std::string`: returns the input unchanged.
///
/// Stream extraction into a string would stop at the first whitespace
/// character, so the generic implementation is bypassed here.
///
/// @param str  Text to return.
/// @return     A copy of `str`.
template <>
std::string convert_to<std::string>(const std::string &str) {
    return str;
}
116+
104117
void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr<ov::Node>& op) {
105118
validate_inputs_count(op, {13});
106119
auto inputs = p.GetInputInfo(op);
@@ -126,6 +139,26 @@ void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr<ov::Node>& op
126139
prim.block_size = 16;
127140
prim.x_block_size = 8;
128141

142+
if (const auto env_var = std::getenv("PA_HEAD_SIZE")) {
143+
prim.head_size = convert_to<size_t>(env_var);
144+
}
145+
146+
if (const auto env_var = std::getenv("PA_HEADS_NUM")) {
147+
prim.heads_num = convert_to<size_t>(env_var);
148+
}
149+
150+
if (const auto env_var = std::getenv("PA_KV_HEADS_NUM")) {
151+
prim.kv_heads_num = convert_to<size_t>(env_var);
152+
}
153+
154+
if (const auto env_var = std::getenv("PA_BLOCK_SIZE")) {
155+
prim.block_size = convert_to<size_t>(env_var);
156+
}
157+
158+
if (const auto env_var = std::getenv("PA_X_BLOCK_SIZE")) {
159+
prim.x_block_size = convert_to<size_t>(env_var);
160+
}
161+
129162
prim.num_outputs = op->get_output_size();
130163
prim.output_data_types = get_output_data_types(op);
131164
prim.output_paddings = get_output_paddings(op);

0 commit comments

Comments
 (0)