Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

StaticLLMPipeline: Simplify compile_model call logic #1915

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions src/cpp/src/llm_pipeline_stateful.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,15 @@ StatefulLLMPipeline::StatefulLLMPipeline(
tokenizer,
device,
properties,
utils::from_config_json_if_exists(models_path),
models_path
utils::from_config_json_if_exists(models_path)
} {}

StatefulLLMPipeline::StatefulLLMPipeline(
const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path)
const ov::genai::GenerationConfig& generation_config)
: LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
utils::apply_slice_before_matmul_transformation(model);
auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
Expand All @@ -70,9 +68,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
ov::CompiledModel compiled_model;
if (m_is_npu) {
utils::KVDesc kv_desc;
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
model, *filtered_properties, kv_pos, models_path / "openvino_model.xml"
);
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(model, *filtered_properties, kv_pos);
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
} else {
compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
Expand Down
3 changes: 1 addition & 2 deletions src/cpp/src/llm_pipeline_stateful.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& config,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path = {}
const ov::genai::GenerationConfig& generation_config
);

StatefulLLMPipeline(
Expand Down
16 changes: 5 additions & 11 deletions src/cpp/src/llm_pipeline_static.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,23 +102,19 @@ StatefulLLMPipeline::StatefulLLMPipeline(
): StatefulLLMPipeline(
genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config),
tokenizer, config,
utils::from_config_json_if_exists(models_path),
models_path
utils::from_config_json_if_exists(models_path)
) {
}

StatefulLLMPipeline::StatefulLLMPipeline(
const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path
const ov::genai::GenerationConfig& generation_config
) : LLMPipelineImplBase(tokenizer, generation_config),
m_sampler(m_tokenizer) {
auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(
model, properties, kv_pos, models_path / "openvino_model.xml"
);
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties, kv_pos);
m_max_prompt_len = kv_desc.max_prompt_len;
m_kvcache_total = kv_desc.max_prompt_len + kv_desc.min_response_len;
m_request = compiled.create_infer_request();
Expand Down Expand Up @@ -358,16 +354,14 @@ LLMPipelineFactory::create(const std::filesystem::path& models_path,
std::unique_ptr<LLMPipelineImplBase> LLMPipelineFactory::create(const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path) {
const ov::genai::GenerationConfig& generation_config) {
auto properties_copy = properties;
const auto pipeline_mode = str_to_pipeline(utils::pop_or_default(properties_copy, "STATIC_PIPELINE", std::string("STATEFUL")));
if (pipeline_mode == StaticPipelineKind::STATEFUL) {
return std::make_unique<ov::genai::static_llm::StatefulLLMPipeline>(model,
tokenizer,
properties_copy,
generation_config,
models_path);
generation_config);
}
OPENVINO_ASSERT(false);
}
Expand Down
6 changes: 2 additions & 4 deletions src/cpp/src/llm_pipeline_static.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ struct LLMPipelineFactory {
static std::unique_ptr<LLMPipelineImplBase> create(const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& models_path = {});
const ov::genai::GenerationConfig& generation_config);
};

class StatefulLLMPipeline : public LLMPipelineImplBase {
Expand All @@ -39,8 +38,7 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config,
const std::filesystem::path& path = {}
const ov::genai::GenerationConfig& generation_config
);

DecodedResults generate(
Expand Down
15 changes: 2 additions & 13 deletions src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,8 +418,7 @@ void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const ch
std::pair<ov::CompiledModel, KVDesc>
compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const std::filesystem::path& model_path) {
const KVAxesPosition& kv_pos) {
ov::CompiledModel compiled;
ov::AnyMap properties = config;
KVDesc kv_desc;
Expand All @@ -443,17 +442,7 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
kv_desc.max_prompt_len = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u);
update_npu_config(properties, model, kv_pos, kv_desc);
auto cache_mode = get_option<CacheMode>(config, ov::cache_mode.name());
        // NB: Select OPTIMIZE_SPEED when model_path isn't provided
if ((cache_mode.has_value() && *cache_mode == CacheMode::OPTIMIZE_SPEED)) {
compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
} else if (model_path.empty()) {
// Set config to OPTIMIZE_SPEED
properties[ov::cache_mode.name()] = CacheMode::OPTIMIZE_SPEED;
compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
} else {
compiled = ov::genai::utils::singleton_core().compile_model(model_path, "NPU", properties);
}
compiled = ov::genai::utils::singleton_core().compile_model(model, "NPU", properties);
// Also export compiled model if required
if (export_blob) {
if (blob_path.empty()) {
Expand Down
3 changes: 1 addition & 2 deletions src/cpp/src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,7 @@ struct KVDesc {

std::pair<ov::CompiledModel, KVDesc> compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
const ov::AnyMap& config,
const KVAxesPosition& kv_pos,
const std::filesystem::path& path = {});
const KVAxesPosition& kv_pos);

/// @brief SharedOptional is a wrapper around a reference to an existing object and an optional shared alternative value.
/// The difference from std::optional is that the default state is not empty and contains a reference to an existing object outside the class.
Expand Down
4 changes: 1 addition & 3 deletions src/cpp/src/visual_language/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
if (m_is_npu) {
embedder_device = "CPU";
utils::KVDesc kv_desc;
std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(
language_model, lm_properties, kv_pos, language_model_path
);
std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(language_model, lm_properties, kv_pos);
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
} else {
compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
Expand Down
Loading