Skip to content

Commit 67bcef1

Browse files
Slice the last matmul in stateful LLM pipeline (openvinotoolkit#814)
Ticket: CVS-154175 Co-authored-by: Andrei Kochin <andrei.kochin@intel.com>
1 parent e5ec1cc commit 67bcef1

File tree

4 files changed

+40
-2
lines changed

4 files changed

+40
-2
lines changed

src/cpp/src/greedy_decoding.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ EncodedResults greedy_decoding(
7373
bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; });
7474
if (!generation_config.ignore_eos && all_are_eos)
7575
return results;
76-
7776

7877
for (size_t i = 0; i < max_new_tokens - 1; ++i) {
7978
if (position_ids.has_value())

src/cpp/src/llm_pipeline.cpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
8282
core.set_property(core_plugin_config);
8383
auto model = core.read_model(model_path / "openvino_model.xml");
8484
m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device); // TODO: Make the prefix name configurable
85+
utils::slice_matmul_statefull_model(model);
8586
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
8687
m_adapter_controller->apply(m_model_runner, m_generation_config.adapters);
8788
} else {
8889
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config);
8990
core.set_property(core_plugin_config);
90-
m_model_runner = core.compile_model(model_path / "openvino_model.xml", device, compile_plugin_config).create_infer_request();
91+
auto model = core.read_model(model_path / "openvino_model.xml");
92+
utils::slice_matmul_statefull_model(model);
93+
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
9194
}
9295

9396
// If eos_token_id was not provided, take value

src/cpp/src/utils.cpp

+34
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@
55

66
#include <fstream>
77

8+
#include "openvino/op/add.hpp"
9+
#include "openvino/op/divide.hpp"
10+
#include "openvino/op/multiply.hpp"
11+
#include "openvino/op/matmul.hpp"
12+
#include "openvino/op/slice.hpp"
13+
#include "openvino/op/tanh.hpp"
14+
#include "openvino/op/transpose.hpp"
15+
816
namespace ov {
917
namespace genai {
1018
namespace utils {
@@ -225,6 +233,32 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token
225233

226234
return {new_input_ids, new_attention_mask};
227235
}
236+
237+
/// Inserts a Slice in front of the MatMul that feeds the model's first output,
/// so that the MatMul only receives the last element along axis 1 of its
/// first input (presumably the sequence axis of the hidden states - confirm
/// against the exported model layout).
///
/// The MatMul is located by inspecting the node that produces output 0 and
/// accepting one of these producer chains (the MatMul must always sit on
/// input 0 of the next node in the chain):
///   - MatMul
///   - MatMul -> Add
///   - MatMul -> Transpose
///   - MatMul -> Divide -> Tanh -> Multiply
///
/// The model is left untouched when no such MatMul is found, or when the rank
/// of the MatMul's first input is dynamic or different from 3.
///
/// @param model Model to modify in place.
void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
    // Node feeding the first model output.
    auto last_node = model->output(0).get_node()->input_value(0).get_node();

    ov::Node* matmul = dynamic_cast<ov::op::v0::MatMul*>(last_node);
    if (matmul) {
        // The output is produced directly by the MatMul; nothing to unwrap.
    } else if (auto add = dynamic_cast<ov::op::v1::Add*>(last_node)) {
        matmul = dynamic_cast<ov::op::v0::MatMul*>(add->input_value(0).get_node());
    } else if (auto transpose = dynamic_cast<ov::op::v1::Transpose*>(last_node)) {
        matmul = dynamic_cast<ov::op::v0::MatMul*>(transpose->input_value(0).get_node());
    } else if (auto multiply = dynamic_cast<ov::op::v1::Multiply*>(last_node)) {
        if (auto tanh = dynamic_cast<ov::op::v0::Tanh*>(multiply->input_value(0).get_node())) {
            if (auto divide = dynamic_cast<ov::op::v1::Divide*>(tanh->input_value(0).get_node())) {
                matmul = dynamic_cast<ov::op::v0::MatMul*>(divide->input_value(0).get_node());
            }
        }
    }

    if (!matmul) {
        return;
    }

    // get_length() is only valid on a static dimension, so a dynamic rank must
    // be rejected before querying it. This transformation is an optional
    // optimization, so an unknown rank means "skip", not "fail".
    const auto rank = matmul->input(0).get_partial_shape().rank();
    if (rank.is_dynamic() || rank.get_length() != 3) {
        return;
    }

    // start = -1, stop = -2, step = -1 on axis 1 selects exactly the last
    // element along that axis and keeps the axis with length 1.
    auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
    auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-2});
    auto step = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
    auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
    auto slice = std::make_shared<ov::op::v8::Slice>(matmul->input_value(0), start, stop, step, axis);
    matmul->input(0).replace_source_output(slice);
}
228262
} // namespace utils
229263
} // namespace genai
230264
} // namespace ov

src/cpp/src/utils.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ ProcessorConfig from_any_map(
8787
std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config);
8888

8989
ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
90+
91+
// Modifies `model` in place: looks for a MatMul producing the first model output
// (directly, or through Add, Transpose, or a Divide -> Tanh -> Multiply chain) and,
// when the MatMul's first input has rank 3, inserts a Slice on that input that
// keeps only the last element along axis 1. Does nothing if no such MatMul is found.
void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model);
9092
} // namespace utils
9193
} // namespace genai
9294
} // namespace ov

0 commit comments

Comments
 (0)