Skip to content

Commit 22a583e

Browse files
committed
[GPU] Update internal buffers interface, add BufferDescriptor struct for internal buffers description
1 parent 3cedf6c commit 22a583e

10 files changed

+54
-64
lines changed

src/plugins/intel_gpu/src/graph/impls/common/wait_for_events.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class wait_for_events_impl : public primitive_impl {
3232
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
3333
void set_arguments(primitive_inst& /*instance*/) override {}
3434
void set_arguments(primitive_inst& /*instance*/, kernel_arguments_data& /*args*/) override {}
35-
std::vector<layout> get_internal_buffer_layouts() const override { return {}; }
35+
std::vector<BufferDescriptor> get_internal_buffer_descs(const kernel_impl_params&) const override { return {}; }
3636

3737
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
3838
auto& stream = instance.get_network().get_stream();

src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -150,17 +150,17 @@ struct border_impl : typed_primitive_impl_ocl<border> {
150150
return args;
151151
}
152152

153-
std::vector<layout> get_internal_buffer_layouts_impl() const override {
153+
std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params&) const override {
154154
const auto& prim_params = static_cast<const kernel_selector::border_params&>(*_kernel_data.params);
155-
std::vector<layout> layouts;
155+
std::vector<BufferDescriptor> internal_buffers;
156156

157157
if ((_kernel_data.params == nullptr && zero_input) ||
158158
(_kernel_data.params != nullptr && prim_params.inputs[0].LogicalSize() == 0)) {
159159
layout any_layout = {data_types::u8, format::bfyx, {1, 1, 1, 1}};
160-
layouts.push_back(any_layout);
160+
internal_buffers.emplace_back(any_layout);
161161
}
162162

163-
return layouts;
163+
return internal_buffers;
164164
}
165165
};
166166

src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,8 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
157157
return _kernels;
158158
}
159159

160-
std::vector<layout> get_internal_buffer_layouts_impl() const override {
161-
std::vector<layout> layouts;
160+
std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params&) const override {
161+
std::vector<BufferDescriptor> internal_buffers;
162162
for (auto& kd : _kernels_data) {
163163
if (kd.internalBuffers.empty())
164164
continue;
@@ -168,10 +168,10 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
168168
for (const auto& buffer : kd.internalBuffers) {
169169
layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flatten to x channel)
170170
{1, 1, 1, (tensor::value_type)(buffer.byte_count / bpp)}};
171-
layouts.push_back(inbuf_layout);
171+
internal_buffers.emplace_back(inbuf_layout);
172172
}
173173
}
174-
return layouts;
174+
return internal_buffers;
175175
}
176176

177177
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {

src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp

+8-25
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
141141
ob << make_data(&use_micro_sdpa, sizeof(bool));
142142
}
143143

144-
std::vector<kernel_selector::InternalBuffer> get_internal_buffers_desc() const {
144+
std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params&) const override {
145145
/*
146146
* Internal buffers allocation owners and users:
147147
* +--------------------------------------+--------------------+--------------------+
@@ -181,12 +181,16 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
181181
* Filled in paged_attention_inst::on_execute() call for sdpa-micro kernel only.
182182
*/
183183

184-
auto add_internal_buffers = [](std::vector<kernel_selector::InternalBuffer>& internal_buffers,
184+
auto add_internal_buffers = [](std::vector<BufferDescriptor>& internal_buffers,
185185
const kernel_selector::KernelData& kd) {
186-
internal_buffers.insert(internal_buffers.end(), kd.internalBuffers.begin(), kd.internalBuffers.end());
186+
for (const auto& buffer_desc : kd.internalBuffers) {
187+
internal_buffers.emplace_back(
188+
layout{ov::PartialShape{static_cast<int64_t>(buffer_desc.byte_count)}, ov::element::u8, format::bfyx},
189+
buffer_desc.lockable);
190+
}
187191
};
188192

189-
std::vector<kernel_selector::InternalBuffer> internal_buffers;
193+
std::vector<BufferDescriptor> internal_buffers;
190194
add_internal_buffers(internal_buffers, _kernels_data[Stage::KV_CACHE_UPDATE]);
191195
add_internal_buffers(internal_buffers, _kernels_data[Stage::PA_SDPA]);
192196

@@ -196,15 +200,6 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
196200
return internal_buffers;
197201
}
198202

199-
std::vector<layout> get_internal_buffer_layouts_impl() const override {
200-
std::vector<layout> layouts;
201-
202-
for (const auto& buffer : get_internal_buffers_desc())
203-
layouts.emplace_back(ov::PartialShape{static_cast<int64_t>(buffer.byte_count)}, ov::element::u8, format::bfyx);
204-
205-
return layouts;
206-
}
207-
208203
kernel_arguments_data get_arguments(const paged_attention_inst& instance, size_t stage, size_t kernel_idx, bool is_mixed_mode) const {
209204
const auto desc = instance.get_node().as<paged_attention>().get_primitive();
210205

@@ -298,18 +293,6 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
298293
return args;
299294
}
300295

301-
std::set<size_t> get_lockable_internal_buffers() const override {
302-
std::set<size_t> lockable_ids;
303-
const auto& internal_buffers = get_internal_buffers_desc();
304-
for (size_t i = 0; i < internal_buffers.size(); i++) {
305-
if (internal_buffers[i].lockable) {
306-
lockable_ids.insert(i);
307-
}
308-
}
309-
310-
return lockable_ids;
311-
};
312-
313296
void prepare_internal_buffers(paged_attention_inst& instance, const PagedAttentionStage& stage) {
314297
const auto& desc = instance.get_impl_params()->typed_desc<paged_attention>();
315298
const bool has_scores_output = desc->has_scores_output();

src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -184,19 +184,19 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
184184
return _kernels;
185185
}
186186

187-
std::vector<layout> get_internal_buffer_layouts_impl() const override {
187+
std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params&) const override {
188188
if (_kernel_data.internalBuffers.empty())
189189
return {};
190190

191-
std::vector<layout> layouts;
191+
std::vector<BufferDescriptor> internal_buffers;
192192
auto dtype = from_data_type(_kernel_data.internalBufferDataType);
193193
const auto bpp = data_type_traits::size_of(dtype);
194194
for (const auto& buffer : _kernel_data.internalBuffers) {
195195
layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flatten to x channel)
196196
{1, 1, 1, (tensor::value_type)(buffer.byte_count / bpp)}};
197-
layouts.push_back(inbuf_layout);
197+
internal_buffers.emplace_back(inbuf_layout);
198198
}
199-
return layouts;
199+
return internal_buffers;
200200
}
201201

202202
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {

src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_prod
5656
}
5757

5858
protected:
59-
std::vector<layout> get_internal_buffer_layouts_impl() const override {
59+
std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params&) const override {
6060
// Look for the first sdpa_opt kernel entry. Currently, it can be used as default sdpa, indirect sdpa, or for both default
6161
// and indirect cases. All of sdpa_opt kernels use the same internal buffers, so we can find the first sdpa_opt and
6262
// use its internal buffers configuration. The following scenarios are possible:
@@ -77,18 +77,18 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_prod
7777
kernel_idx = 1;
7878
}
7979

80-
std::vector<layout> layouts;
80+
std::vector<BufferDescriptor> internal_buffers;
8181
if (kernel_idx < _kernels_data.size()) {
8282
auto dtype = from_data_type(_kernels_data[kernel_idx].internalBufferDataType);
8383
const auto bpp = data_type_traits::size_of(dtype);
8484
for (const auto& buffer : _kernels_data[kernel_idx].internalBuffers) {
8585
layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flatten to x channel)
8686
{1, 1, 1, (tensor::value_type)(buffer.byte_count / bpp)}};
87-
layouts.push_back(inbuf_layout);
87+
internal_buffers.emplace_back(inbuf_layout);
8888
}
8989
}
9090

91-
return layouts;
91+
return internal_buffers;
9292
}
9393

9494
static size_t get_beam_table_id(std::shared_ptr<const scaled_dot_product_attention> primitive) {

src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -572,10 +572,12 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
572572
return event;
573573
}
574574

575-
std::vector<layout> get_internal_buffer_layouts_impl() const override {
575+
std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params&) const override {
576576
if (_scratchpad_md.get_size() == 0)
577577
return {};
578-
return {{{1, 1, 1, (tensor::value_type)(_scratchpad_md.get_size())}, cldnn::data_types::u8, format::bfyx}};
578+
return {BufferDescriptor({ov::PartialShape{static_cast<int64_t>(_scratchpad_md.get_size())},
579+
cldnn::data_types::u8,
580+
format::bfyx})};
579581
}
580582
};
581583

src/plugins/intel_gpu/src/graph/impls/sycl/primitive_sycl_base.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ struct typed_primitive_sycl_impl : public typed_primitive_impl<PType> {
5151
}
5252
}
5353

54-
std::vector<layout> get_internal_buffer_layouts_impl() const override {
54+
std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params&) const override {
5555
return {};
5656
}
5757
};

src/plugins/intel_gpu/src/graph/include/primitive_inst.h

+11-5
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ template <class PType>
4242
class typed_primitive_inst;
4343

4444
struct ImplementationManager;
45+
struct BufferDescriptor;
4546

4647
/*
4748
Base class for all implementations.
@@ -55,8 +56,7 @@ struct primitive_impl {
5556
primitive_impl(nullptr, std::move(kernel_name), is_dynamic) {}
5657
virtual ~primitive_impl() = default;
5758

58-
virtual std::vector<layout> get_internal_buffer_layouts() const = 0;
59-
virtual std::set<size_t> get_lockable_internal_buffers() const { return {}; }
59+
virtual std::vector<BufferDescriptor> get_internal_buffer_descs(const kernel_impl_params& impl_params) const = 0;
6060
virtual void set_node_params(const program_node&) {}
6161
virtual const std::string& get_type_info() const = 0;
6262
virtual void set_arguments(primitive_inst& instance) = 0;
@@ -149,6 +149,12 @@ struct ImplementationsFactory {
149149
bool has(impl_types impl_type) const;
150150
};
151151

152+
struct BufferDescriptor {
153+
explicit BufferDescriptor(const layout& l, bool lockable = false) : m_lockable(lockable), m_layout(l) {}
154+
bool m_lockable = false;
155+
layout m_layout;
156+
};
157+
152158
/*
153159
Base class for all primitive instances.
154160
Its main responsibility is to allocate memory required to run single, specified in ctor,
@@ -516,11 +522,11 @@ struct typed_primitive_impl : public primitive_impl {
516522
return execute_impl(event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
517523
}
518524

519-
std::vector<layout> get_internal_buffer_layouts() const override {
520-
return get_internal_buffer_layouts_impl();
525+
std::vector<BufferDescriptor> get_internal_buffer_descs(const kernel_impl_params& impl_params) const override {
526+
return get_internal_buffer_descs_impl(impl_params);
521527
}
522528

523-
virtual std::vector<layout> get_internal_buffer_layouts_impl() const {
529+
virtual std::vector<BufferDescriptor> get_internal_buffer_descs_impl(const kernel_impl_params& impl_params) const {
524530
return {};
525531
}
526532

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

+13-14
Original file line numberDiff line numberDiff line change
@@ -1046,34 +1046,33 @@ void primitive_inst::realloc_if_needed(bool prev_execution_skipped) {
10461046
{
10471047
if (_impl == nullptr)
10481048
return;
1049-
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
1050-
if (ibuf_layouts.empty())
1049+
const auto& buffer_descs = _impl->get_internal_buffer_descs(*_impl_params);
1050+
if (buffer_descs.empty())
10511051
return;
10521052
GPU_DEBUG_CODE(std::string memalloc_info = "");
1053-
const auto& lockable_buffers_indexes = _impl->get_lockable_internal_buffers();
1054-
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
1055-
auto need_lockable = lockable_buffers_indexes.find(i) != lockable_buffers_indexes.end();
1053+
for (size_t i = 0; i < buffer_descs.size(); ++i) {
1054+
auto need_lockable = buffer_descs[i].m_lockable;
10561055
auto alloc_type = i < _intermediates_memory.size() ? _intermediates_memory[i]->get_allocation_type()
10571056
: allocation_type::unknown;
10581057
bool can_reuse = true;
10591058
can_reuse &= alloc_type != allocation_type::unknown &&
1060-
ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i];
1059+
buffer_descs[i].m_layout.bytes_count() <= max_intermediates_memory_sizes[i];
10611060
can_reuse &= (need_lockable && alloc_type != cldnn::allocation_type::usm_device) ||
10621061
(!need_lockable && alloc_type != cldnn::allocation_type::usm_host);
10631062

10641063
if (can_reuse) {
1065-
_intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
1064+
_intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], buffer_descs[i].m_layout);
10661065
GPU_DEBUG_CODE(memalloc_info += ((_intermediates_memory.size() > 1) ? ("i" + to_string(i) + ":") : "") + "reuse_buffer");
10671066
} else {
10681067
// TODO: If there is a kernel which requires resetting the internal buffer in the future,
10691068
// we'll need an additional handle for that purpose, like need_reset_output_memory
10701069
const bool need_reset = false;
10711070
if (i < _intermediates_memory.size()) {
1072-
_intermediates_memory[i] = allocate_internal_buffer(ibuf_layouts[i], i, need_reset, need_lockable);
1071+
_intermediates_memory[i] = allocate_internal_buffer(buffer_descs[i].m_layout, i, need_reset, need_lockable);
10731072
max_intermediates_memory_sizes[i] = _intermediates_memory[i]->size();
10741073
} else {
10751074
// i-th layout has not been allocated yet
1076-
_intermediates_memory.push_back(allocate_internal_buffer(ibuf_layouts[i], i, need_reset, need_lockable));
1075+
_intermediates_memory.push_back(allocate_internal_buffer(buffer_descs[i].m_layout, i, need_reset, need_lockable));
10771076
max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
10781077
}
10791078
GPU_DEBUG_CODE(memalloc_info +=
@@ -2204,16 +2203,16 @@ memory::ptr primitive_inst::allocate_internal_buffer(const layout& layout, size_
22042203
void primitive_inst::allocate_internal_buffers(bool reset) {
22052204
if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
22062205
return;
2207-
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
2208-
if (ibuf_layouts.empty())
2206+
const auto& buffer_descs = _impl->get_internal_buffer_descs(*_impl_params);
2207+
if (buffer_descs.empty())
22092208
return;
22102209

22112210
// allocate intermediate memory for the updated layout of buffer
22122211
std::vector<memory::ptr> intermediates_memory;
2213-
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
2214-
if (ibuf_layouts[i].get_linear_size() == 0)
2212+
for (size_t i = 0; i < buffer_descs.size(); ++i) {
2213+
if (buffer_descs[i].m_layout.get_linear_size() == 0)
22152214
continue;
2216-
intermediates_memory.push_back(allocate_internal_buffer(ibuf_layouts[i], i, reset));
2215+
intermediates_memory.push_back(allocate_internal_buffer(buffer_descs[i].m_layout, i, reset));
22172216
max_intermediates_memory_sizes.push_back(intermediates_memory[i]->size());
22182217
}
22192218
_intermediates_memory = intermediates_memory;

0 commit comments

Comments
 (0)