[GPU] Common shape info buffer. phase 1 (openvinotoolkit#28167)

dnkurek · Vladimir Paramuzov · web-flow · commit d63a9df0a24b · 2025-01-14T12:14:04.000Z
### Details:
 - Reduce memory usage
 - common buffer for shape info as the first step

### Tickets:
 - *ticket-id*

---------

Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -160,6 +160,7 @@ struct network {
     std::map<primitive_id, network_output> execute(const std::vector<event::ptr>& dependencies = {});
 
     void validate_primitives();
+    // Allocates one shared shape_info buffer and gives every primitive in the execution order
+    // that has shape info a sub-buffer of it. Called from the constructor after validate_primitives().
+    void preallocate_shape_info_buffers();
     void set_arguments();
     // Implementation specific calls
     bool does_node_need_lockable_output(const primitive_id& id) const;
@@ -220,6 +221,9 @@ struct network {
     bool _reset_arguments;
     bool _reuse_variable_mem = false;
 
+    /* Single memory object backing shape_info for the whole network. It is allocated in
+     * preallocate_shape_info_buffers(), which hands each primitive a sub-buffer of it. */
+    memory::ptr _shape_info_ptr;
+
     std::unordered_map<primitive_id, std::shared_ptr<primitive_inst>> _primitives;
     std::vector<shared_mem_type> _in_out_shared_mem_types;
     std::vector<std::shared_ptr<primitive_inst>> _inputs;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -57,6 +57,9 @@ class engine {
     /// Created memory object from memory @p params and reinterpred the data using specified @p layout
     virtual memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) = 0;
 
+    /// Creates a sub-buffer memory object that views the other @p memory starting at @p byte_offset (in bytes)
+    /// and interprets the data using the specified @p new_layout
+    virtual memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) = 0;
+
     /// Created memory object from the other @p memory and reinterpred the data using specified @p new_layout
     virtual memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) = 0;
 
diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -232,6 +232,7 @@ class primitive_inst {
     }
 
     memory::ptr shape_info_memory_ptr() const { return _shape_info_memory; }
+    // Replaces the memory returned by shape_info_memory_ptr() with @p addr.
+    // network::preallocate_shape_info_buffers() passes a sub-buffer of the network-wide shape_info buffer here.
+    void set_shape_info_memory_subbuffer(memory::ptr addr);
 
     void add_dep_events(const std::vector<event::ptr>& events);
     void add_dep_event(event::ptr ev);
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -207,6 +207,7 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
     build_insts_deps();
     build_exec_order();
     validate_primitives();
+    preallocate_shape_info_buffers();
     add_default_output_chains();
 }
 
@@ -275,6 +276,39 @@ void network::validate_primitives() {
     }
 }
 
+// Allocates a single i32 buffer large enough for the shape_info of every primitive in
+// _exec_order and assigns each primitive with a non-zero shape_info size its own sub-buffer.
+// Does nothing (and leaves _shape_info_ptr untouched) when no primitive needs shape info.
+void network::preallocate_shape_info_buffers() {
+    GPU_DEBUG_DEFINE_MEM_LOGGER("preallocate_shape_info_buffers");
+
+    // Every slot is padded up to a multiple of 512 i32 elements (for performance), so
+    // consecutive sub-buffers start a multiple of 512 * sizeof(int32_t) bytes apart.
+    const int alignment = 512;
+
+    // Pass 1: total element count of the shared buffer, padding included.
+    int64_t total_elements = 0;
+    for (auto const& inst : _exec_order) {
+        int64_t padded_elements = align_to(inst->get_node().get_total_shape_info_size(), alignment);
+        total_elements += padded_elements;
+    }
+
+    if (total_elements == 0)
+        return;
+
+    auto& device = get_engine();
+    _shape_info_ptr = device.allocate_memory(layout{{total_elements}, data_types::i32, format::bfyx}, false);
+
+    // Pass 2: carve the shared buffer into per-primitive sub-buffers.
+    size_t byte_offset = 0;
+    for (auto const& inst : _exec_order) {
+        const int64_t elements = inst->get_node().get_total_shape_info_size();
+
+        if (elements != 0) {
+            // The sub-buffer layout covers only the elements actually used; the padding stays unmapped.
+            auto slot = device.create_subbuffer(*_shape_info_ptr, layout{{elements}, data_types::i32, format::bfyx}, byte_offset);
+            inst->set_shape_info_memory_subbuffer(slot);
+
+            // Advance by the padded slot size, converted from elements to bytes.
+            byte_offset += align_to(elements, alignment) * sizeof(int32_t);
+        }
+    }
+}
+
 void network::set_arguments() {
     if (!_reset_arguments)
         return;
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1146,6 +1146,10 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la
     }
 }
 
+// Rebinds this instance's shape_info memory to @p addr (e.g. a sub-buffer of the
+// network-wide shape_info buffer created by network::preallocate_shape_info_buffers()).
+// @p addr is a by-value sink parameter, so it is moved into the member rather than
+// copied a second time; this avoids an extra reference-count update on the pointer.
+void primitive_inst::set_shape_info_memory_subbuffer(memory::ptr addr) {
+    _shape_info_memory = std::move(addr);
+}
+
 void primitive_inst::allocate_shape_info_memory() {
     int64_t shape_elements = _node->get_total_shape_info_size();
     _shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -207,7 +207,36 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
         }
     }
 }
+// Creates a memory object that views a sub-range of @p memory.
+//
+// @param memory      Parent allocation; must belong to this engine.
+// @param new_layout  Layout of the sub-buffer. 2D image formats are not supported.
+// @param byte_offset Start of the sub-buffer inside @p memory, in bytes.
+// @return A gpu_usm (USM parent) or gpu_buffer (cl_mem parent) object for the sub-range,
+//         sharing the parent's memory tracker.
+// Throws via OPENVINO_THROW when the OpenCL runtime reports a cl::Error.
+memory::ptr ocl_engine::create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) {
+    OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to create a subbuffer from a buffer allocated by a different engine");
+    try {
+        if (new_layout.format.is_image_2d()) {
+            OPENVINO_NOT_IMPLEMENTED;
+        } else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
+            // USM: the sub-buffer is the parent pointer advanced by byte_offset
+            // (cl::UsmMemory applies the offset to the raw pointer).
+            auto& parent_usm = reinterpret_cast<const ocl::gpu_usm&>(memory);
+            auto parent_ptr = parent_usm.get_buffer().get();
+            auto sub_buffer = cl::UsmMemory(get_usm_helper(), parent_ptr, byte_offset);
+
+            return std::make_shared<ocl::gpu_usm>(this,
+                                     new_layout,
+                                     sub_buffer,
+                                     memory.get_allocation_type(),
+                                     memory.get_mem_tracker());
+        } else {
+            auto buffer = reinterpret_cast<const ocl::gpu_buffer&>(memory).get_buffer();
+            // cl_buffer_region is {origin, size} with both fields in bytes, so the size must be
+            // the layout's byte size rather than its element count.
+            cl_buffer_region sub_buffer_region = { byte_offset, new_layout.bytes_count() };
+            // clCreateSubBuffer does not accept host-pointer flags (CL_MEM_USE_HOST_PTR etc.);
+            // those properties are inherited from the parent buffer.
+            auto sub_buffer = buffer.createSubBuffer(CL_MEM_READ_WRITE,
+                            CL_BUFFER_CREATE_TYPE_REGION, &sub_buffer_region);
 
+            return std::make_shared<ocl::gpu_buffer>(this,
+                                     new_layout,
+                                     sub_buffer,
+                                     memory.get_mem_tracker());
+        }
+    } catch (cl::Error const& err) {
+        OPENVINO_THROW(OCL_ERR_MSG_FMT(err));
+    }
+}
 memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
     OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine");
     OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(),
@@ -221,7 +250,7 @@ memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& n
                                      reinterpret_cast<const ocl::gpu_image2d&>(memory).get_buffer(),
                                      memory.get_mem_tracker());
         } else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
-            return std::make_shared<ocl::gpu_usm>(this,
+           return std::make_shared<ocl::gpu_usm>(this,
                                      new_layout,
                                      reinterpret_cast<const ocl::gpu_usm&>(memory).get_buffer(),
                                      memory.get_allocation_type(),
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -26,6 +26,7 @@ class ocl_engine : public engine {
 
     memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override;
     memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override;
+    // Sub-buffer view of @p memory starting at @p byte_offset (in bytes); see engine::create_subbuffer.
+    memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) override;
     memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
     bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
     bool check_allocatable(const layout& layout, allocation_type type) override;
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
@@ -928,9 +928,9 @@ class UsmHolder {
 class UsmMemory {
 public:
     explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { }
-    UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr)
+    UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr, size_t offset = 0)
     : _usmHelper(usmHelper)
-    , _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, usm_ptr, true)) {
+    , _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, reinterpret_cast<uint8_t*>(usm_ptr) + offset, true)) {
         if (!usm_ptr) {
             throw std::runtime_error("[GPU] Can't share null usm pointer");
         }

Original file line number	Diff line number	Diff line change
`@@ -232,6 +232,7 @@ class primitive_inst {`
`232`	`232`	`}`
`233`	`233`
`234`	`234`	`memory::ptr shape_info_memory_ptr() const { return _shape_info_memory; }`
	`235`	`+ void set_shape_info_memory_subbuffer(memory::ptr addr);`
`235`	`236`
`236`	`237`	`void add_dep_events(const std::vector<event::ptr>& events);`
`237`	`238`	`void add_dep_event(event::ptr ev);`
Original file line number	Diff line number	Diff line change
`@@ -1146,6 +1146,10 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la`
`1146`	`1146`	`}`
`1147`	`1147`	`}`
`1148`	`1148`
	`1149`	`+void primitive_inst::set_shape_info_memory_subbuffer(memory::ptr addr) {`
	`1150`	`+ _shape_info_memory = addr;`
	`1151`	`+}`
	`1152`	`+`
`1149`	`1153`	`void primitive_inst::allocate_shape_info_memory() {`
`1150`	`1154`	`int64_t shape_elements = _node->get_total_shape_info_size();`
`1151`	`1155`	`_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);`