CB: drop profiling as it drops performance (openvinotoolkit#1280)

ilya-lavrenov · web-flow · commit 6f160e0ca8fd · 2024-11-30T17:33:50.000Z
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
@@ -21,7 +21,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
 
     ov::Core core;
 
-    auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
+    auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
     core.set_property(core_properties);
 
     // The model can be compiled for GPU as well
@@ -57,7 +57,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
     }
 
     SchedulerConfig updated_config = scheduler_config;
-    // update KV number in scheduler config
+    // update KV blocks number in scheduler config
     if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) {
         updated_config.num_kv_blocks = device_config.get_num_kv_blocks();
     }
@@ -166,24 +166,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
         timer.start();
         logits = m_model_runner->forward(m_requests, scheduler_output);
         timer.end();
-
-        ov::InferRequest infer_request = m_model_runner->get_infer_request();
-        ov::CompiledModel compiled_model = infer_request.get_compiled_model();
-        const bool is_profiling_enabled = compiled_model.get_property(ov::enable_profiling);
-
-        // collect detailed statistic
-        if (is_profiling_enabled) {
-            std::vector<ov::ProfilingInfo> profiling_info = m_model_runner->get_infer_request().get_profiling_info();
-            for (const ov::ProfilingInfo& info : profiling_info) {
-                double current_time = info.real_time.count();
-                if (info.node_type == "PagedAttentionExtension") {
-                    m_perf.m_paged_attention_time_ms += current_time;
-                } else if (info.node_type == "FullyConnected") {
-                    m_perf.m_matmul_time_ms += current_time;
-                }
-                m_perf.m_infer_total_ms += current_time;
-            }
-        }
     }
 
 #ifdef DEBUG_CACHE_STATE_DUMP
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
@@ -63,15 +63,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     {
         ov::Core core;
         if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
-            auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config);
+            auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(*filtered_plugin_config);
             core.set_property(core_plugin_config);
             auto model = core.read_model(models_path / "openvino_model.xml");
             m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
             m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device);   // TODO: Make the prefix name configurable
             utils::slice_matmul_statefull_model(model);
             m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
         } else {
-            auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config);
+            auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(plugin_config);
             core.set_property(core_plugin_config);
             auto model = core.read_model(models_path / "openvino_model.xml");
             utils::slice_matmul_statefull_model(model);
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -31,7 +31,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
     const ov::genai::ModelDesc draft_model_desc,
     const ov::AnyMap& tokenizer_properties) {
     ov::Core core;
-    auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_properties);
+    auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(main_properties);
     core.set_property(core_properties);
 
     std::filesystem::path openvino_model_name = "openvino_model.xml",
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
@@ -203,7 +203,7 @@ ProcessorConfig from_any_map(
  * There are not supported by `core.compile` function plugin options like `ENABLE_MMAP`
  * Move this options to `core.set_property` config
  */
-std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& properties) {
+std::pair<ov::AnyMap, ov::AnyMap> split_core_compile_config(const ov::AnyMap& properties) {
     const std::vector<std::string> unsupported_by_compile_properties{"ENABLE_MMAP"};
     ov::AnyMap core_properties;
     ov::AnyMap compile_properties{properties};
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
@@ -78,7 +78,7 @@ ProcessorConfig from_any_map(
     const ProcessorConfig& initial
 );
 
-std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& properties);
+std::pair<ov::AnyMap, ov::AnyMap> split_core_compile_config(const ov::AnyMap& properties);
 
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
 
diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp
@@ -53,7 +53,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
                                 const ov::AnyMap& properties)
         : WhisperPipelineImplBase{models_path} {
         ov::Core core = utils::singleton_core();
-        auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(properties);
+        auto [core_properties, compile_properties] = ov::genai::utils::split_core_compile_config(properties);
         core.set_property(core_properties);
 
         m_models.encoder =