@@ -666,6 +666,10 @@ void TemperedLB::doLBStages(LoadType start_imb) {
666
666
underloaded_.clear ();
667
667
load_info_.clear ();
668
668
is_overloaded_ = is_underloaded_ = false ;
669
+ other_rank_clusters_.clear ();
670
+
671
+ // Not clearing shared_block_size_ because this never changes and
672
+ // the knowledge might be useful
669
673
}
670
674
671
675
vt_debug_print (
@@ -686,11 +690,12 @@ void TemperedLB::doLBStages(LoadType start_imb) {
686
690
687
691
computeClusterSummary ();
688
692
693
+ // Verbose printing about local clusters
689
694
for (auto const & [shared_id, value] : cur_blocks_) {
690
695
auto const & [shared_bytes, cluster_load] = value;
691
696
vt_print (
692
697
temperedlb,
693
- " Cluster : id={}, bytes={}, load={}\n " ,
698
+ " Local cluster : id={}, bytes={}, load={}\n " ,
694
699
shared_id, shared_bytes, cluster_load
695
700
);
696
701
}
@@ -714,6 +719,28 @@ void TemperedLB::doLBStages(LoadType start_imb) {
714
719
vtAbort (" TemperedLB:: Unsupported inform type" );
715
720
}
716
721
722
+ // Some very verbose printing about all remote clusters we know about that
723
+ // we can shut off later
724
+ for (auto const & [node, clusters] : other_rank_clusters_) {
725
+ for (auto const & [shared_id, value] : clusters) {
726
+ auto const & [shared_bytes, cluster_load] = value;
727
+ vt_print (
728
+ temperedlb,
729
+ " Remote cluster: node={}, id={}, bytes={}, load={}\n " ,
730
+ node, shared_id, shared_bytes, cluster_load
731
+ );
732
+ }
733
+ }
734
+
735
+ // Move remote cluster information to shared_block_size_ so we have all
736
+ // the sizes in the same place
737
+ for (auto const & [node, clusters] : other_rank_clusters_) {
738
+ for (auto const & [shared_id, value] : clusters) {
739
+ auto const & [shared_bytes, _] = value;
740
+ shared_block_size_[shared_id] = shared_bytes;
741
+ }
742
+ }
743
+
717
744
// Execute transfer stage
718
745
switch (transfer_type_) {
719
746
case TransferTypeEnum::Original:
@@ -1049,6 +1076,16 @@ void TemperedLB::propagateIncomingAsync(LoadMsgAsync* msg) {
1049
1076
trial_, iter_, k_max_, k_cur_async, from_node, msg->getNodeLoad ().size ()
1050
1077
);
1051
1078
1079
+ auto const this_node = theContext ()->getNode ();
1080
+ for (auto const & [node, clusters] : msg->getNodeClusterSummary ()) {
1081
+ if (
1082
+ node != this_node and
1083
+ other_rank_clusters_.find (node) == other_rank_clusters_.end ()
1084
+ ) {
1085
+ other_rank_clusters_[node] = clusters;
1086
+ }
1087
+ }
1088
+
1052
1089
for (auto && elm : msg->getNodeLoad ()) {
1053
1090
if (load_info_.find (elm.first ) == load_info_.end ()) {
1054
1091
load_info_[elm.first ] = elm.second ;
@@ -1083,6 +1120,16 @@ void TemperedLB::propagateIncomingSync(LoadMsgSync* msg) {
1083
1120
trial_, iter_, k_max_, k_cur_, from_node, msg->getNodeLoad ().size ()
1084
1121
);
1085
1122
1123
+ auto const this_node = theContext ()->getNode ();
1124
+ for (auto const & [node, clusters] : msg->getNodeClusterSummary ()) {
1125
+ if (
1126
+ node != this_node and
1127
+ other_rank_clusters_.find (node) == other_rank_clusters_.end ()
1128
+ ) {
1129
+ other_rank_clusters_[node] = clusters;
1130
+ }
1131
+ }
1132
+
1086
1133
for (auto && elm : msg->getNodeLoad ()) {
1087
1134
if (new_load_info_.find (elm.first ) == new_load_info_.end ()) {
1088
1135
new_load_info_[elm.first ] = elm.second ;
0 commit comments