You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Separate reset for KV state and LoRA state in LLMPipeline (openvinotoolkit#1058)
Fix a bug where the LoRA state was reset every time generate was
invoked, which added unnecessary overhead to each generate
call even when the LoRA tensors/alphas had not changed.
// The next call to apply will set all adapter tensors regardless of config changes; use this method when a full state.reset is called for the controlled model
194
-
voidforce_full_apply(bool full_apply = true);
193
+
// Returns true if a given name is one of the state names created by this adapter controller for dynamic LoRA
194
+
// Helps to distinguish LoRA states from other states (e.g. KV cache state) in the model for a partial state reset.
@@ -273,11 +284,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
273
284
}
274
285
275
286
if (!is_chat_conversation) {
276
-
// FIXME: Reset only the KV cache part of the state; LoRA may also be applied via the states, and a full reset requires reapplying LoRA even if the LoRA config has not changed
277
-
m_model_runner.reset_state();
278
-
if(m_adapter_controller) {
279
-
m_adapter_controller->force_full_apply(); // FIXME: Reset only KV cache part to avoid this call
280
-
}
287
+
reset_kv_state();
281
288
m_selected_beam = std::nullopt;
282
289
} else {
283
290
m_is_cache_empty = false;
@@ -297,7 +304,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
297
304
is_chat_conversation = true;
298
305
m_selected_beam = std::nullopt;
299
306
if (!m_is_cache_empty) {
300
-
m_model_runner.reset_state();
307
+
reset_kv_state();
301
308
m_is_cache_empty = true;
302
309
m_history = {};
303
310
m_templated_chat_history = "";
@@ -315,7 +322,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
0 commit comments