Skip to content

Commit d63a9df

Browse files
dnkurekVladimir Paramuzov
and
Vladimir Paramuzov
authored
[GPU] Common shape info buffer. phase 1 (openvinotoolkit#28167)
### Details: - Reduce memory usage - common buffer for shape info as the first step ### Tickets: - *ticket-id* --------- Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
1 parent 8e838ec commit d63a9df

File tree

8 files changed

+79
-3
lines changed

8 files changed

+79
-3
lines changed

src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp

+4
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ struct network {
160160
std::map<primitive_id, network_output> execute(const std::vector<event::ptr>& dependencies = {});
161161

162162
void validate_primitives();
163+
void preallocate_shape_info_buffers();
163164
void set_arguments();
164165
// Implementation specific calls
165166
bool does_node_need_lockable_output(const primitive_id& id) const;
@@ -220,6 +221,9 @@ struct network {
220221
bool _reset_arguments;
221222
bool _reuse_variable_mem = false;
222223

224+
/* Common memory pointer for shape_info */
225+
memory::ptr _shape_info_ptr;
226+
223227
std::unordered_map<primitive_id, std::shared_ptr<primitive_inst>> _primitives;
224228
std::vector<shared_mem_type> _in_out_shared_mem_types;
225229
std::vector<std::shared_ptr<primitive_inst>> _inputs;

src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ class engine {
5757
/// Creates a memory object from the memory @p params and reinterprets the data using the specified @p new_layout
5858
virtual memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) = 0;
5959

60+
/// Creates a sub-buffer memory object that starts @p byte_offset bytes into the other @p memory and reinterprets the data using the specified @p new_layout
61+
virtual memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) = 0;
62+
6063
/// Creates a memory object from the other @p memory and reinterprets the data using the specified @p new_layout
6164
virtual memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) = 0;
6265

src/plugins/intel_gpu/src/graph/include/primitive_inst.h

+1
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ class primitive_inst {
232232
}
233233

234234
memory::ptr shape_info_memory_ptr() const { return _shape_info_memory; }
235+
void set_shape_info_memory_subbuffer(memory::ptr addr);
235236

236237
void add_dep_events(const std::vector<event::ptr>& events);
237238
void add_dep_event(event::ptr ev);

src/plugins/intel_gpu/src/graph/network.cpp

+34
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
207207
build_insts_deps();
208208
build_exec_order();
209209
validate_primitives();
210+
preallocate_shape_info_buffers();
210211
add_default_output_chains();
211212
}
212213

@@ -275,6 +276,39 @@ void network::validate_primitives() {
275276
}
276277
}
277278

279+
void network::preallocate_shape_info_buffers() {
280+
GPU_DEBUG_DEFINE_MEM_LOGGER("preallocate_shape_info_buffers");
281+
int64_t sum = 0;
282+
283+
/* Use 512 byte alignment for performance */
284+
const int alignment = 512;
285+
286+
for (auto const& prim : _exec_order) {
287+
auto& node = prim->get_node();
288+
int64_t shape_elements = align_to(node.get_total_shape_info_size(), alignment);
289+
sum += shape_elements;
290+
}
291+
292+
if (sum == 0)
293+
return;
294+
295+
auto& engine = get_engine();
296+
_shape_info_ptr = engine.allocate_memory(layout{{sum}, data_types::i32, format::bfyx}, false);
297+
size_t offset = 0;
298+
for (auto const& prim : _exec_order) {
299+
auto& node = prim->get_node();
300+
const int64_t shape_elements = node.get_total_shape_info_size();
301+
302+
if (shape_elements == 0)
303+
continue;
304+
305+
auto new_mem = engine.create_subbuffer(*_shape_info_ptr, layout{{shape_elements}, data_types::i32, format::bfyx}, offset);
306+
prim->set_shape_info_memory_subbuffer(new_mem);
307+
308+
offset += align_to(shape_elements, alignment) * sizeof(int32_t);
309+
}
310+
}
311+
278312
void network::set_arguments() {
279313
if (!_reset_arguments)
280314
return;

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -1146,6 +1146,10 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la
11461146
}
11471147
}
11481148

1149+
/// Stores @p addr as this instance's shape-info memory, replacing whatever
/// _shape_info_memory previously pointed to.
void primitive_inst::set_shape_info_memory_subbuffer(memory::ptr addr) {
    this->_shape_info_memory = addr;
}
1152+
11491153
void primitive_inst::allocate_shape_info_memory() {
11501154
int64_t shape_elements = _node->get_total_shape_info_size();
11511155
_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);

src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp

+30-1
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,36 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
207207
}
208208
}
209209
}
210+
/// Creates a memory object that views a region of @p memory starting @p byte_offset bytes
/// from its beginning, described by @p new_layout.
///
/// - Image 2D layouts are not supported (OPENVINO_NOT_IMPLEMENTED).
/// - USM allocations: the parent's USM pointer is advanced by @p byte_offset and wrapped in a
///   new gpu_usm object that reports the parent's allocation type and memory tracker.
/// - cl_mem buffers: an OpenCL sub-buffer is created for the region and wrapped in a new
///   gpu_buffer object that reports the parent's memory tracker.
///
/// @param memory      Parent memory; must have been allocated by this engine (asserted).
/// @param new_layout  Layout of the returned sub-buffer.
/// @param byte_offset Offset of the sub-buffer inside @p memory, in bytes.
/// @return Memory object for the requested region.
/// @throws Rethrows any cl::Error through OPENVINO_THROW with the formatted OpenCL message.
memory::ptr ocl_engine::create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) {
    OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to create a subbuffer from a buffer allocated by a different engine");
    try {
        if (new_layout.format.is_image_2d()) {
            OPENVINO_NOT_IMPLEMENTED;
        } else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
            auto& new_buf = reinterpret_cast<const ocl::gpu_usm&>(memory);
            auto ptr = new_buf.get_buffer().get();
            // NOTE(review): presumably the returned object does not keep the parent USM
            // allocation alive, so the caller must hold @p memory for as long as the
            // sub-buffer is in use - confirm against cl::UsmMemory / UsmHolder.
            auto sub_buffer = cl::UsmMemory(get_usm_helper(), ptr, byte_offset);

            return std::make_shared<ocl::gpu_usm>(this,
                                                  new_layout,
                                                  sub_buffer,
                                                  memory.get_allocation_type(),
                                                  memory.get_mem_tracker());
        } else {
            auto buffer = reinterpret_cast<const ocl::gpu_buffer&>(memory).get_buffer();
            // NOTE(review): cl_buffer_region::size is expressed in bytes; verify that
            // get_linear_size() returns a byte count rather than an element count for
            // @p new_layout, otherwise the region is too small for multi-byte data types.
            cl_buffer_region sub_buffer_region = { byte_offset, new_layout.get_linear_size() };
            // Host-pointer flags (CL_MEM_USE_HOST_PTR / ALLOC_HOST_PTR / COPY_HOST_PTR) are
            // rejected by clCreateSubBuffer with CL_INVALID_VALUE; a sub-buffer inherits
            // them from its parent, so only the access flag is passed here.
            auto sub_buffer = buffer.createSubBuffer(CL_MEM_READ_WRITE,
                                                     CL_BUFFER_CREATE_TYPE_REGION,
                                                     &sub_buffer_region);

            return std::make_shared<ocl::gpu_buffer>(this,
                                                     new_layout,
                                                     sub_buffer,
                                                     memory.get_mem_tracker());
        }
    } catch (cl::Error const& err) {
        OPENVINO_THROW(OCL_ERR_MSG_FMT(err));
    }
}
211240
memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
212241
OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine");
213242
OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(),
@@ -221,7 +250,7 @@ memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& n
221250
reinterpret_cast<const ocl::gpu_image2d&>(memory).get_buffer(),
222251
memory.get_mem_tracker());
223252
} else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
224-
return std::make_shared<ocl::gpu_usm>(this,
253+
return std::make_shared<ocl::gpu_usm>(this,
225254
new_layout,
226255
reinterpret_cast<const ocl::gpu_usm&>(memory).get_buffer(),
227256
memory.get_allocation_type(),

src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class ocl_engine : public engine {
2626

2727
memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override;
2828
memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override;
29+
memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override;
2930
memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
3031
bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
3132
bool check_allocatable(const layout& layout, allocation_type type) override;

src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -928,9 +928,9 @@ class UsmHolder {
928928
class UsmMemory {
929929
public:
930930
explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { }
931-
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr)
931+
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr, size_t offset = 0)
932932
: _usmHelper(usmHelper)
933-
, _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, usm_ptr, true)) {
933+
, _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, reinterpret_cast<uint8_t*>(usm_ptr) + offset, true)) {
934934
if (!usm_ptr) {
935935
throw std::runtime_error("[GPU] Can't share null usm pointer");
936936
}

0 commit comments

Comments
 (0)