8
8
#include " openvino/core/model.hpp"
9
9
#include " openvino/op/loop.hpp"
10
10
#include " openvino/op/lstm_sequence.hpp"
11
+ #include " openvino/op/paged_attention.hpp"
11
12
#include " openvino/op/search_sorted.hpp"
12
13
#include " openvino/op/stft.hpp"
13
14
#include " ov_ops/dynamic_quantize.hpp"
@@ -135,6 +136,7 @@ void ExecutionConfig::apply_model_specific_options(const IRemoteContext* context
135
136
136
137
const auto & ops = model.get_ops ();
137
138
139
+ auto is_paged_attention_model = false ;
138
140
std::function<void (std::shared_ptr<Node>)> process_op = [&, this ](std::shared_ptr<Node> op) {
139
141
if (requires_new_shape_infer (op)) {
140
142
m_allow_new_shape_infer = true ;
@@ -158,12 +160,28 @@ void ExecutionConfig::apply_model_specific_options(const IRemoteContext* context
158
160
}
159
161
}
160
162
}
163
+
164
+ if (ov::is_type<ov::op::PagedAttentionExtension>(op)) {
165
+ is_paged_attention_model = true ;
166
+ }
161
167
};
162
168
163
169
for (const auto & op : ops) {
164
170
process_op (op);
165
171
}
166
172
173
+ const auto & info = dynamic_cast <const RemoteContextImpl*>(context)->get_engine ().get_device_info ();
174
+ if (!is_set_by_user (ov::hint::kv_cache_precision) || get_kv_cache_precision () == ov::element::dynamic) {
175
+ if (is_paged_attention_model || !info.supports_immad ) {
176
+ // Enable KV-cache compression by default for:
177
+ // 1) Non-systolic platforms in case of SDPA-based models
178
+ // 2) Any platform in case of PagedAttention-based models
179
+ m_kv_cache_precision = ov::element::i8;
180
+ } else {
181
+ m_kv_cache_precision = get_inference_precision ();
182
+ }
183
+ }
184
+
167
185
m_optimize_data = true ;
168
186
}
169
187
@@ -185,15 +203,6 @@ void ExecutionConfig::finalize_impl(const IRemoteContext* context) {
185
203
m_queue_type = QueueTypes::in_order;
186
204
}
187
205
188
- if (!is_set_by_user (ov::hint::kv_cache_precision) || get_kv_cache_precision () == ov::element::dynamic) {
189
- if (info.supports_immad ) { // MFDNN-11755
190
- m_kv_cache_precision = get_inference_precision ();
191
- } else {
192
- // Enable KV-cache compression by default for non-systolic platforms only
193
- m_kv_cache_precision = ov::element::i8;
194
- }
195
- }
196
-
197
206
// Enable dynamic quantization by default for non-systolic platforms
198
207
if (!is_set_by_user (ov::hint::dynamic_quantization_group_size) && get_dynamic_quantization_group_size () == 0 && !info.supports_immad ) {
199
208
m_dynamic_quantization_group_size = 32 ;
@@ -203,6 +212,11 @@ void ExecutionConfig::finalize_impl(const IRemoteContext* context) {
203
212
m_optimize_data = true ;
204
213
}
205
214
215
+ // Replace the UINT8 KV-cache compression data type with INT8, as the plugin is supposed to work with INT8 internally
216
+ if (get_kv_cache_precision () == ov::element::u8) {
217
+ m_kv_cache_precision = ov::element::i8;
218
+ }
219
+
206
220
#ifdef ENABLE_DEBUG_CAPS
207
221
// For now we apply env/config only for build with debug caps, but it can be updated in the future to allow
208
222
// reading release options for any build type
0 commit comments