Skip to content

Commit 56a07b0

Browse files
committed
#2201: temperedlb: add data structures to track other ranks' clusters
1 parent 4e217ac commit 56a07b0

File tree

2 files changed

+50
-1
lines changed

2 files changed

+50
-1
lines changed

src/vt/vrt/collection/balance/temperedlb/temperedlb.cc

+48-1
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,10 @@ void TemperedLB::doLBStages(LoadType start_imb) {
666666
underloaded_.clear();
667667
load_info_.clear();
668668
is_overloaded_ = is_underloaded_ = false;
669+
other_rank_clusters_.clear();
670+
671+
// Not clearing shared_block_size_ because this never changes and
672+
// the knowledge might be useful
669673
}
670674

671675
vt_debug_print(
@@ -686,11 +690,12 @@ void TemperedLB::doLBStages(LoadType start_imb) {
686690

687691
computeClusterSummary();
688692

693+
// Verbose printing about local clusters
689694
for (auto const& [shared_id, value] : cur_blocks_) {
690695
auto const& [shared_bytes, cluster_load] = value;
691696
vt_print(
692697
temperedlb,
693-
"Cluster: id={}, bytes={}, load={}\n",
698+
"Local cluster: id={}, bytes={}, load={}\n",
694699
shared_id, shared_bytes, cluster_load
695700
);
696701
}
@@ -714,6 +719,28 @@ void TemperedLB::doLBStages(LoadType start_imb) {
714719
vtAbort("TemperedLB:: Unsupported inform type");
715720
}
716721

722+
// Very verbose printing of every remote cluster we currently know about;
723+
// this can be shut off later
724+
for (auto const& [node, clusters] : other_rank_clusters_) {
725+
for (auto const& [shared_id, value] : clusters) {
726+
auto const& [shared_bytes, cluster_load] = value;
727+
vt_print(
728+
temperedlb,
729+
"Remote cluster: node={}, id={}, bytes={}, load={}\n",
730+
node, shared_id, shared_bytes, cluster_load
731+
);
732+
}
733+
}
734+
735+
// Copy remote cluster information into shared_block_size_ so we have all
736+
// the sizes in the same place
737+
for (auto const& [node, clusters] : other_rank_clusters_) {
738+
for (auto const& [shared_id, value] : clusters) {
739+
auto const& [shared_bytes, _] = value;
740+
shared_block_size_[shared_id] = shared_bytes;
741+
}
742+
}
743+
717744
// Execute transfer stage
718745
switch (transfer_type_) {
719746
case TransferTypeEnum::Original:
@@ -1049,6 +1076,16 @@ void TemperedLB::propagateIncomingAsync(LoadMsgAsync* msg) {
10491076
trial_, iter_, k_max_, k_cur_async, from_node, msg->getNodeLoad().size()
10501077
);
10511078

1079+
auto const this_node = theContext()->getNode();
1080+
for (auto const& [node, clusters] : msg->getNodeClusterSummary()) {
1081+
if (
1082+
node != this_node and
1083+
other_rank_clusters_.find(node) == other_rank_clusters_.end()
1084+
) {
1085+
other_rank_clusters_[node] = clusters;
1086+
}
1087+
}
1088+
10521089
for (auto&& elm : msg->getNodeLoad()) {
10531090
if (load_info_.find(elm.first) == load_info_.end()) {
10541091
load_info_[elm.first] = elm.second;
@@ -1083,6 +1120,16 @@ void TemperedLB::propagateIncomingSync(LoadMsgSync* msg) {
10831120
trial_, iter_, k_max_, k_cur_, from_node, msg->getNodeLoad().size()
10841121
);
10851122

1123+
auto const this_node = theContext()->getNode();
1124+
for (auto const& [node, clusters] : msg->getNodeClusterSummary()) {
1125+
if (
1126+
node != this_node and
1127+
other_rank_clusters_.find(node) == other_rank_clusters_.end()
1128+
) {
1129+
other_rank_clusters_[node] = clusters;
1130+
}
1131+
}
1132+
10861133
for (auto&& elm : msg->getNodeLoad()) {
10871134
if (new_load_info_.find(elm.first) == new_load_info_.end()) {
10881135
new_load_info_[elm.first] = elm.second;

src/vt/vrt/collection/balance/temperedlb/temperedlb.h

+2
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ struct TemperedLB : BaseLB {
228228
std::unordered_map<ObjIDType, BytesType> obj_working_bytes_;
229229
/// Current assignment memory/load summary
230230
ClusterSummaryType cur_blocks_;
231+
/// Clusters that we know of on other ranks (might be out of date)
232+
std::unordered_map<NodeType, ClusterSummaryType> other_rank_clusters_;
231233
/// User-defined memory threshold
232234
BytesType mem_thresh_ = 0;
233235
};

0 commit comments

Comments
 (0)