
Commit e48c323

[NPU] Refactoring Pipeline object (#29679)
### Details:
- *Refactoring Pipeline object*

### Ticket:
- *CVS-164942*

---------

Signed-off-by: Bogdan Pereanu <bogdan.pereanu@intel.com>
1 parent 30bddee commit e48c323

6 files changed: +77 −130 lines changed

src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp (-5)

@@ -13,7 +13,6 @@
 #include "intel_npu/utils/zero/zero_utils.hpp"
 #include "intel_npu/utils/zero/zero_wrappers.hpp"
 #include "zero_pipeline.hpp"
-#include "zero_profiling.hpp"
 #include "zero_remote_tensor.hpp"
 #include "zero_tensor.hpp"

@@ -37,7 +36,6 @@ class ZeroInferRequest final : public SyncInferRequest {

 private:
     std::vector<ov::ProfilingInfo> get_profiling_info() const override;
-    std::vector<uint8_t> get_raw_profiling_data() const;

     /**
      * @brief Check the received tensor and set the Level Zero tensor accordingly

@@ -88,9 +86,6 @@ class ZeroInferRequest final : public SyncInferRequest {
     std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
     std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;

-    zeroProfiling::ProfilingPool _profilingPool;
-    zeroProfiling::ProfilingQuery _profilingQuery;
-    std::shared_ptr<zeroProfiling::NpuInferProfiling> _npuProfiling;
     std::unique_ptr<Pipeline> _pipeline;

     bool _pipelineIsCreated = false;

src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp (+6 -8)

@@ -18,9 +18,6 @@ struct Pipeline {
     Pipeline(const Config& config,
              const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
              const std::shared_ptr<IGraph>& graph,
-             zeroProfiling::ProfilingPool& profiling_pool,
-             zeroProfiling::ProfilingQuery& profiling_query,
-             const std::shared_ptr<zeroProfiling::NpuInferProfiling>& npu_profiling,
              const std::vector<std::vector<std::shared_ptr<ov::ITensor>>>& input_tensors,
              const std::vector<std::shared_ptr<ov::ITensor>>& output_tensors);

@@ -32,17 +29,19 @@ struct Pipeline {
     void pull();
     void reset() const;

-    void updateCommandList(uint32_t arg_index, const void* arg_data, size_t byte_size);
-    void updateCommandListIndex(uint32_t arg_index, const void* arg_data, size_t command_list_index);
+    void update_graph_arguments(uint32_t arg_index, const void* arg_data, size_t byte_size);
+    void update_graph_arguments_batching(uint32_t arg_index, const void* arg_data, size_t batch_index);

-    void closeCommandList();
-    void closeCommandListIndex(size_t command_list_index);
+    std::vector<ov::ProfilingInfo> get_profiling_info() const;

 protected:
     std::shared_ptr<IGraph> _graph;
     const Config _config;
     const uint32_t _id;

+    std::unique_ptr<zeroProfiling::ProfilingQuery> _profiling_query;
+    std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
+
     /**
      * @brief Indicates how many command lists will be used inside the pipeline.
      * @details Leveraging multiple command lists implies distributing the input/output buffers accross the batch axis

@@ -59,7 +58,6 @@ struct Pipeline {
     std::shared_ptr<EventPool> _event_pool;
     std::vector<std::shared_ptr<Event>> _events;
     bool _sync_output_with_fences = true;
-    std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
     Logger _logger;
 };
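A minimal sketch of how a caller exercises the renamed Pipeline surface, based solely on the signatures above (the wrapper functions and variable names are hypothetical, not plugin code):

```cpp
// Sketch only. update_graph_arguments() replaces the former
// updateCommandList()/closeCommandList() pair, so callers no longer
// close command lists explicitly after mutating graph arguments.
void rebind_argument(Pipeline& pipeline, uint32_t arg_index, const std::shared_ptr<ov::ITensor>& tensor) {
    pipeline.update_graph_arguments(arg_index, tensor->data(), tensor->get_byte_size());
}

// With batching, each command list is updated per batch entry instead:
void rebind_batched_argument(Pipeline& pipeline, uint32_t arg_index, const void* data, size_t batch_index) {
    pipeline.update_graph_arguments_batching(arg_index, data, batch_index);
}
```

Profiling also moves behind the Pipeline: the profiling query and NPU profiling objects become Pipeline members, and callers retrieve results through `Pipeline::get_profiling_info()` instead of owning the query objects themselves.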

src/plugins/intel_npu/src/backend/include/zero_profiling.hpp (+3 -1)

@@ -50,7 +50,7 @@ struct ProfilingQuery {
           _index(index) {}
     ProfilingQuery(const ProfilingQuery&) = delete;
     ProfilingQuery& operator=(const ProfilingQuery&) = delete;
-    void create(const ze_graph_profiling_pool_handle_t& profiling_pool);
+    void create(const std::shared_ptr<ProfilingPool>& profiling_pool);
     ze_graph_profiling_query_handle_t getHandle() const {
         return _handle;
     }

@@ -67,6 +67,8 @@
     std::shared_ptr<ZeroInitStructsHolder> _init_structs;
     const uint32_t _index;

+    std::shared_ptr<ProfilingPool> _profiling_pool = nullptr;
+
     ze_graph_profiling_query_handle_t _handle = nullptr;
 };
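The `shared_ptr`-based `create()` means the query now co-owns its pool, so the pool can no longer be destroyed while a query still references its Level Zero handle. A hedged sketch of the new lifetime contract (the ProfilingPool constructor arguments are copied from the initializer removed in zero_infer_request.cpp below and may not match the post-refactoring signature exactly):

```cpp
// Sketch: ProfilingQuery stores the pool in its _profiling_pool member.
auto pool = std::make_shared<zeroProfiling::ProfilingPool>(init_structs, graph, zeroProfiling::POOL_SIZE);
auto query = std::make_unique<zeroProfiling::ProfilingQuery>(init_structs, /*index=*/0);

query->create(pool);  // copies the shared_ptr, extending the pool's lifetime
pool.reset();         // safe: the query's copy keeps the pool handle valid
```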

src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp (+25 -82)

@@ -78,17 +78,9 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
       _graphInputDescriptors(_graph->get_input_descriptors()),
       _graphOutputDescriptors(_graph->get_output_descriptors()),
       _levelZeroInputTensors(_metadata.inputs.size(), std::vector<std::shared_ptr<ov::ITensor>>(1, nullptr)),
-      _levelZeroOutputTensors(_metadata.outputs.size(), nullptr),
-      _profilingPool(_initStructs, _graph, zeroProfiling::POOL_SIZE),
-      _profilingQuery(_initStructs, 0) {
+      _levelZeroOutputTensors(_metadata.outputs.size(), nullptr) {
     _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest");

-    auto proftype = config.get<PROFILING_TYPE>();
-    if (proftype == ov::intel_npu::ProfilingType::INFER) {
-        _logger.debug("ZeroInferRequest::ZeroInferRequest - profiling type == ov::intel_npu::ProfilingType::INFER");
-        _npuProfiling = std::make_shared<zeroProfiling::NpuInferProfiling>(_initStructs, _config.get<LOG_LEVEL>());
-    }
-
     _outputAllocator = std::make_shared<const zeroMemory::HostMemAllocator>(_initStructs);
     _inputAllocator =
         std::make_shared<const zeroMemory::HostMemAllocator>(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED);

@@ -195,14 +187,8 @@ void ZeroInferRequest::create_pipeline() {
     _logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline");

     // Construct pipeline
-    _pipeline = std::make_unique<Pipeline>(_config,
-                                           _initStructs,
-                                           _graph,
-                                           _profilingPool,
-                                           _profilingQuery,
-                                           _npuProfiling,
-                                           _levelZeroInputTensors,
-                                           _levelZeroOutputTensors);
+    _pipeline =
+        std::make_unique<Pipeline>(_config, _initStructs, _graph, _levelZeroInputTensors, _levelZeroOutputTensors);

     _logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed");
 }

@@ -241,12 +227,11 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso

         OPENVINO_ASSERT(levelZeroTensors->data(), "Empty buffer");

-        OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList");
-        _pipeline->updateCommandList(
+        OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "update_graph_arguments");
+        _pipeline->update_graph_arguments(
             isInput ? _graph->get_input_descriptors().at(index).idx : _graph->get_output_descriptors().at(index).idx,
             levelZeroTensors->data(),
             levelZeroTensors->get_byte_size());
-        _pipeline->closeCommandList();
     }
 }

@@ -269,12 +254,11 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTe
         auto data = tensor->get_original_memory();
         OPENVINO_ASSERT(data, "Empty buffer");

-        OV_ITT_TASK_NEXT(ZERO_SET_REMOTE_TENSOR, "updateCommandList");
-        _pipeline->updateCommandList(
+        OV_ITT_TASK_NEXT(ZERO_SET_REMOTE_TENSOR, "update_graph_arguments");
+        _pipeline->update_graph_arguments(
             isInput ? _graph->get_input_descriptors().at(index).idx : _graph->get_output_descriptors().at(index).idx,
             data,
             tensor->get_byte_size());
-        _pipeline->closeCommandList();
     }
 }

@@ -384,12 +368,12 @@ void ZeroInferRequest::set_tensors(const ov::Output<const ov::Node>& port,
         }

         if (_pipelineIsCreated) {
-            OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList");
-
             OPENVINO_ASSERT(data, "Empty buffer");
+            OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList");

-            _pipeline->updateCommandListIndex(_graph->get_input_descriptors().at(foundPort.idx).idx, data, i);
-            _pipeline->closeCommandListIndex(i);
+            _pipeline->update_graph_arguments_batching(_graph->get_input_descriptors().at(foundPort.idx).idx,
+                                                       data,
+                                                       i);
         }
     }
 }

@@ -442,7 +426,6 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
 }

 void ZeroInferRequest::update_pipeline_if_memory_changed() {
-    bool closePipeline = false;
     size_t ioIndex = 0;

     for (const auto& levelZeroTensor : _levelZeroInputTensors) {

@@ -459,10 +442,9 @@ void ZeroInferRequest::update_pipeline_if_memory_changed() {
             _logger.debug("Update input graph descriptor with the new tensor");
             OPENVINO_ASSERT(zeroTensor->data(), "Empty buffer");

-            _pipeline->updateCommandList(_graph->get_input_descriptors().at(ioIndex).idx,
-                                         zeroTensor->data(),
-                                         zeroTensor->get_byte_size());
-            closePipeline = true;
+            _pipeline->update_graph_arguments(_graph->get_input_descriptors().at(ioIndex).idx,
+                                              zeroTensor->data(),
+                                              zeroTensor->get_byte_size());

             if (!inputDescriptor.isStateInput) {
                 zeroTensor->reset_memory_flag();

@@ -487,25 +469,18 @@ void ZeroInferRequest::update_pipeline_if_memory_changed() {
             _logger.debug("Update output graph descriptor with the new tensor");
             OPENVINO_ASSERT(zeroTensor->data(), "Empty buffer");

-            _pipeline->updateCommandList(_graph->get_output_descriptors().at(ioIndex).idx,
-                                         zeroTensor->data(),
-                                         zeroTensor->get_byte_size());
-            closePipeline = true;
+            _pipeline->update_graph_arguments(_graph->get_output_descriptors().at(ioIndex).idx,
+                                              zeroTensor->data(),
+                                              zeroTensor->get_byte_size());

             zeroTensor->reset_memory_flag();
         }

         ++ioIndex;
     }
-
-    if (closePipeline) {
-        _pipeline->closeCommandList();
-    }
 }

 void ZeroInferRequest::update_states_if_memory_changed() {
-    bool closePipeline = false;
-
     for (const auto& variableState : _variableStates) {
         auto zeroState = std::dynamic_pointer_cast<ZeroVariableState>(variableState._ptr);

@@ -522,27 +497,21 @@ void ZeroInferRequest::update_states_if_memory_changed() {

                 void* userBuffer = !remoteTensor ? zeroState->get_state()->data() : remoteTensor->get_original_memory();

-                _pipeline->updateCommandList(_graphInputDescriptors.at(zeroState->get_tensor_index()).idx,
-                                             userBuffer,
-                                             zeroState->get_state()->get_byte_size());
+                _pipeline->update_graph_arguments(_graphInputDescriptors.at(zeroState->get_tensor_index()).idx,
+                                                  userBuffer,
+                                                  zeroState->get_state()->get_byte_size());

-                _pipeline->updateCommandList(_graphOutputDescriptors.at(zeroState->get_related_tensor_index()).idx,
-                                             userBuffer,
-                                             zeroState->get_state()->get_byte_size());
+                _pipeline->update_graph_arguments(_graphOutputDescriptors.at(zeroState->get_related_tensor_index()).idx,
+                                                  userBuffer,
+                                                  zeroState->get_state()->get_byte_size());

                 zeroState->reset_zero_tensor_updated_flag();

                 get_level_zero_input(zeroState->get_tensor_index()) = zeroState->get_state()._ptr;
                 _levelZeroOutputTensors.at(zeroState->get_related_tensor_index()) = zeroState->get_state()._ptr;
-
-                closePipeline = true;
             }
         }
     }
-
-    if (closePipeline) {
-        _pipeline->closeCommandList();
-    }
 }

 void ZeroInferRequest::infer() {

@@ -749,31 +718,9 @@ void ZeroInferRequest::check_network_precision(const ov::element::Type_t precisi
 }

 std::vector<ov::ProfilingInfo> ZeroInferRequest::get_profiling_info() const {
-    _logger.debug("InferRequest::get_profiling_info started");
-    const auto& compiledModel = *std::dynamic_pointer_cast<const ICompiledModel>(_compiledModel);
-    const auto& compilerConfig = compiledModel.get_config();
-    if (!compilerConfig.get<PERF_COUNT>() || !_config.get<PERF_COUNT>()) {
-        _logger.warning("InferRequest::get_profiling_info complete with empty {}.");
-        return {};
-    }
+    OPENVINO_ASSERT(_pipeline, "Profiling information isn't available before running an inference!");

-    auto compilerType = compilerConfig.get<COMPILER_TYPE>();
-    if (compilerType == ov::intel_npu::CompilerType::MLIR) {
-        // For plugin compiler retreive raw profiling data from backend and delegate
-        // processing to the compiler
-        auto profData = get_raw_profiling_data();
-        _logger.debug("InferRequest::get_profiling_info complete with compiler->process_profiling_output().");
-        return _graph->process_profiling_output(profData, compilerConfig);
-    } else {
-        auto proftype = _config.get<PROFILING_TYPE>();
-        if (proftype == ov::intel_npu::ProfilingType::INFER) {
-            _logger.debug("InferRequest::get_profiling_info complete with _npuProfiling->getNpuInferStatistics().");
-            return _npuProfiling->getNpuInferStatistics();
-        } else { /// proftype = MODEL or undefined = fallback to model profiling
-            _logger.debug("InferRequest::get_profiling_info complete with _profilingQuery.getLayerStatistics().");
-            return _profilingQuery.getLayerStatistics();
-        }
-    }
+    return _pipeline->get_profiling_info();
 }

 std::shared_ptr<ov::ITensor> ZeroInferRequest::create_tensor(ov::element::Type type,

@@ -797,10 +744,6 @@ void ZeroInferRequest::add_state(const IODescriptor& descriptor, size_t tensorIn
                                       _config));
 }

-std::vector<uint8_t> ZeroInferRequest::get_raw_profiling_data() const {
-    return _profilingQuery.getData<uint8_t>();
-}
-
 std::shared_ptr<ov::ITensor>& ZeroInferRequest::get_level_zero_input(size_t index, size_t tensorNo) const {
     return _levelZeroInputTensors.at(index).at(tensorNo);
 }
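The user-facing profiling flow is unchanged by this refactoring; only the internal ownership moved into the Pipeline. A minimal sketch through the public OpenVINO API (the model path and device name are placeholders; profiling still has to be enabled, e.g. via `ov::enable_profiling`):

```cpp
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // "model.xml" and "NPU" are placeholder values for this sketch.
    auto compiled = core.compile_model("model.xml", "NPU", ov::enable_profiling(true));
    auto request = compiled.create_infer_request();
    request.infer();

    // After this commit, the plugin asserts that the Pipeline exists (i.e. an
    // inference ran) before profiling data can be returned.
    for (const auto& info : request.get_profiling_info()) {
        std::cout << info.node_name << ": " << info.real_time.count() << " us\n";
    }
    return 0;
}
```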
