Skip to content

Commit 8aca88a

Browse files
Split initial memory dependencies and runtime ones to reduce overheads in program_node and primitive_instance
1 parent 565c2b6 commit 8aca88a

File tree

7 files changed

+66
-55
lines changed

7 files changed

+66
-55
lines changed

src/plugins/intel_gpu/include/intel_gpu/runtime/memory_pool.hpp

+52-9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,54 @@ class engine;
2626
using primitive_id = std::string;
2727
using memory_ptr = std::shared_ptr<memory>;
2828

29+
// Set-like view made of two parts: a read-only, externally owned set and an
// internally owned set of extra keys. This lets a caller extend a shared base
// set with additional keys without copying the base set.
//
// Lifetime: the external set handed to the constructor is NOT copied. It must
// outlive this object and every copy made from it.
template<typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>>
class memory_restricter {
private:
    using set_type = std::unordered_set<Key, Hash, KeyEqual>;

    // Non-owning pointer to the read-only external set; never null. A pointer is used
    // instead of a reference member so that the class stays copy- and move-assignable.
    const set_type* set1;
    // Internal modifiable set. insert() only adds keys that are absent from *set1.
    set_type set2;
    // Static empty set used as the external part by the default constructor.
    static set_type empty_set;

    // Applies `visit` to every key of the external set, then to every key of the internal set.
    template<typename Visitor>
    void visit_keys(Visitor& visit) const {
        for (const auto& key : *set1) visit(key);
        for (const auto& key : set2) visit(key);
    }

public:
    // Default constructor: the external part is the shared static empty set.
    memory_restricter()
        : set1(&empty_set) {}

    // Binds the external part to `externalSet` without copying it.
    // `externalSet` must stay alive for as long as this object (or a copy of it) is used.
    explicit memory_restricter(const std::unordered_set<Key, Hash, KeyEqual>& externalSet)
        : set1(&externalSet) {}

    // Adds `key` to the internal set unless the external set already holds it
    // (the external set is read-only and is never modified here).
    void insert(const Key& key) {
        if (set1->find(key) == set1->end())
            set2.insert(key);
    }

    // Returns true if `key` is present in either the external or the internal set.
    bool contains(const Key& key) const {
        return set1->find(key) != set1->end() || set2.find(key) != set2.end();
    }

    // Sum of the sizes of both sets. This equals the number of distinct keys as long as
    // the external set has not gained a key that was previously inserted here.
    size_t size() const {
        return set1->size() + set2.size();
    }

    // Returns true only if both the external and the internal set are empty.
    bool empty() const {
        return set1->empty() && set2.empty();
    }

    // Calls `func` for each key of the external set, then for each key of the internal set.
    void for_each(void(*func)(const Key&)) const {
        visit_keys(func);
    }

    // Same iteration as above for any callable invocable with `const Key&`
    // (capturing lambdas, functors, ...). The callable is invoked in place, not copied.
    template<typename Visitor>
    void for_each(Visitor&& visit) const {
        visit_keys(visit);
    }
};  // end of memory_restricter

// Define the static empty_set
template<typename Key, typename Hash, typename KeyEqual>
std::unordered_set<Key, Hash, KeyEqual> memory_restricter<Key, Hash, KeyEqual>::empty_set(0);  // minimize its memory usage
76+
2977
struct memory_user {
3078
size_t _unique_id;
3179
uint32_t _network_id;
@@ -112,7 +160,7 @@ struct padded_pool_comparer {
112160

113161
class memory_pool {
114162
memory_ptr alloc_memory(const layout& layout, allocation_type type, bool reset = true);
115-
static bool has_conflict(const memory_set&, const std::unordered_set<uint32_t>&, uint32_t network_id);
163+
static bool has_conflict(const memory_set&, const memory_restricter<uint32_t>&, uint32_t network_id);
116164

117165
std::multimap<uint64_t, memory_record> _non_padded_pool;
118166
std::map<layout, std::list<memory_record>, padded_pool_comparer> _padded_pool;
@@ -127,7 +175,7 @@ class memory_pool {
127175
const primitive_id& id,
128176
size_t unique_id,
129177
uint32_t network_id,
130-
const std::unordered_set<uint32_t>& restrictions,
178+
const memory_restricter<uint32_t>& restrictions,
131179
allocation_type type,
132180
bool reusable = true,
133181
bool reset = true,
@@ -137,21 +185,16 @@ class memory_pool {
137185
const primitive_id& prim_id,
138186
size_t unique_id,
139187
uint32_t network_id,
140-
const std::unordered_set<uint32_t>&,
188+
const memory_restricter<uint32_t>&,
141189
allocation_type type,
142190
bool reset = true,
143191
bool is_dynamic = false);
144192
memory_ptr get_from_padded_pool(const layout& layout,
145193
const primitive_id& prim_id,
146194
size_t unique_id,
147195
uint32_t network_id,
148-
const std::unordered_set<uint32_t>& restrictions,
196+
const memory_restricter<uint32_t>& restrictions,
149197
allocation_type type);
150-
memory_ptr get_from_across_networks_pool(const layout& layout,
151-
const primitive_id& id,
152-
size_t unique_id,
153-
uint32_t network_id,
154-
allocation_type type);
155198
void clear_pool_for_network(uint32_t network_id);
156199
void release_memory(memory* memory, const size_t& unique_id, primitive_id prim_id, uint32_t network_id);
157200

src/plugins/intel_gpu/src/graph/include/primitive_inst.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ class primitive_inst {
213213
_users = _network.get_primitives(users);
214214
}
215215

216-
const std::unordered_set<uint32_t>& get_runtime_memory_dependencies() const { return _runtime_memory_dependencies; }
216+
const memory_restricter<uint32_t>& get_runtime_memory_dependencies() const { return _runtime_memory_dependencies; }
217217

218218
const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
219219
// return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
@@ -298,7 +298,7 @@ class primitive_inst {
298298
memory_pool& pool,
299299
const program_node& _node,
300300
const kernel_impl_params& impl_params,
301-
const std::unordered_set<uint32_t>& memory_dependencies,
301+
const memory_restricter<uint32_t>& memory_dependencies,
302302
uint32_t net_id,
303303
bool is_internal,
304304
size_t idx = 0,
@@ -369,7 +369,7 @@ class primitive_inst {
369369
std::vector<primitive_inst*> _exec_deps;
370370

371371
// List of primitive ids that this primitive can't share memory buffers with
372-
std::unordered_set<uint32_t> _runtime_memory_dependencies;
372+
memory_restricter<uint32_t> _runtime_memory_dependencies;
373373

374374
// This is sub-network generated on demand to execute unfused primitives sequence instead of single fused primitive
375375
// Needed for dynamic path only, as fusion in some cases may be illegal, but it can't be checked on program build phase,

src/plugins/intel_gpu/src/graph/include/program_node.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ struct program_node {
207207
size_t get_dependency_index(const program_node& node) const;
208208
size_t get_user_index(const program_node& node) const;
209209

210-
std::unordered_set<uint32_t> get_memory_dependencies() const;
210+
const std::unordered_set<uint32_t>& get_memory_dependencies() const;
211211
void add_memory_dependency(size_t);
212212
void add_memory_dependency(std::vector<size_t>);
213213

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ static memory::ptr get_memory_from_pool(engine& _engine,
158158
const layout& layout,
159159
allocation_type type,
160160
bool reusable_across_network,
161-
const std::unordered_set<uint32_t>& memory_dependencies,
161+
const memory_restricter<uint32_t>& memory_dependencies,
162162
bool reset = true,
163163
memory* curr_memory = nullptr) {
164164
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(),
@@ -2351,7 +2351,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
23512351
memory_pool& pool,
23522352
const program_node& _node,
23532353
const kernel_impl_params& impl_params,
2354-
const std::unordered_set<uint32_t>& memory_dependencies,
2354+
const memory_restricter<uint32_t>& memory_dependencies,
23552355
uint32_t net_id,
23562356
bool is_internal,
23572357
size_t idx,

src/plugins/intel_gpu/src/graph/program.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,7 @@ std::string program::get_memory_dependencies_string() const {
766766
.append("(unique_id:")
767767
.append(std::to_string(node->get_unique_id()))
768768
.append(") restricted list: ");
769-
for (auto it : node->get_memory_dependencies())
769+
for (const auto& it : node->get_memory_dependencies())
770770
mem_dep = mem_dep.append(std::to_string(it)).append(",");
771771
mem_dep = mem_dep.append("\n");
772772
}
@@ -1708,7 +1708,7 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
17081708
pool,
17091709
*node,
17101710
*node->get_kernel_impl_params(),
1711-
node->get_memory_dependencies(),
1711+
memory_restricter<uint32_t>(node->get_memory_dependencies()),
17121712
0,
17131713
false,
17141714
0,

src/plugins/intel_gpu/src/graph/program_node.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ void program_node::remove_dependency(size_t idx) {
195195
dependencies.erase(dependencies.begin() + idx);
196196
}
197197

198-
std::unordered_set<uint32_t> program_node::get_memory_dependencies() const {
198+
const std::unordered_set<uint32_t>& program_node::get_memory_dependencies() const {
199199
// Monitor performance
200200
// if (org_id.find("split:Multiply_31737_fused_3_Multiply_split.out2") != std::string::npos) {
201201
// std::cout << "#########caught split:Multiply_31737_fused_3_Multiply_split.out2 here2!\n\n";

src/plugins/intel_gpu/src/runtime/memory_pool.cpp

+5-37
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ memory::ptr memory_pool::alloc_memory(const layout& layout, allocation_type type
3737
memory_pool::~memory_pool() {}
3838

3939
bool memory_pool::has_conflict(const memory_set& mem_cand,
40-
const std::unordered_set<uint32_t>& restrictions,
40+
const memory_restricter<uint32_t>& restrictions,
4141
uint32_t b_network_id) {
4242
for (const auto& mem_usr : mem_cand) {
43-
if (restrictions.find(mem_usr._unique_id) != restrictions.end())
43+
if (restrictions.contains(mem_usr._unique_id))
4444
return true;
4545
}
4646
return false;
@@ -151,7 +151,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
151151
const primitive_id& prim_id,
152152
size_t unique_id,
153153
uint32_t network_id,
154-
const std::unordered_set<uint32_t>& restrictions,
154+
const memory_restricter<uint32_t>& restrictions,
155155
allocation_type type,
156156
bool reset,
157157
bool is_dynamic) {
@@ -197,7 +197,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
197197
const primitive_id& prim_id,
198198
size_t unique_id,
199199
uint32_t network_id,
200-
const std::unordered_set<uint32_t>& restrictions,
200+
const memory_restricter<uint32_t>& restrictions,
201201
allocation_type type) {
202202
auto first_level_cache = _padded_pool.find(layout);
203203
if (first_level_cache != _padded_pool.end()) {
@@ -250,38 +250,6 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
250250
return mem;
251251
}
252252

253-
/*
254-
This is not reusable within one network or its internal micro networks. But we can use these memory records
255-
between networks.
256-
*/
257-
memory::ptr memory_pool::get_from_across_networks_pool(const layout& layout,
258-
const primitive_id& prim_id,
259-
size_t unique_id,
260-
uint32_t network_id,
261-
allocation_type type) {
262-
const auto layout_bytes_count = layout.bytes_count();
263-
auto it = _no_reusable_pool.lower_bound(layout_bytes_count);
264-
265-
while (it != _no_reusable_pool.end()) {
266-
if (it->second._network_id != network_id &&
267-
it->second._type == type) { // don't use non reusable resources within the same network
268-
if (!has_conflict(it->second._users, {}, network_id)) {
269-
it->second._users.insert(memory_user(MEM_USER(unique_id, network_id, prim_id, layout_bytes_count)));
270-
auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
271-
GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
272-
return ret_mem;
273-
}
274-
}
275-
++it;
276-
}
277-
auto mem = alloc_memory(layout, type);
278-
{
279-
_no_reusable_pool.emplace(layout_bytes_count,
280-
memory_record({{MEM_USER(unique_id, network_id, prim_id, layout_bytes_count)}}, mem, network_id, type));
281-
}
282-
return mem;
283-
}
284-
285253
memory::ptr memory_pool::get_memory(const layout& layout, allocation_type type, bool reset) {
286254
return alloc_memory(layout, type, reset);
287255
}
@@ -290,7 +258,7 @@ memory::ptr memory_pool::get_memory(const layout& layout,
290258
const primitive_id& prim_id,
291259
const size_t unique_id,
292260
uint32_t network_id,
293-
const std::unordered_set<uint32_t>& restrictions,
261+
const memory_restricter<uint32_t>& restrictions,
294262
allocation_type type,
295263
bool reusable_across_network,
296264
bool reset,

0 commit comments

Comments
 (0)