#2201: temperedlb: add data structures to track other rank's clusters

lifflander · lifflander · commit 56a07b0cdf1e · 2023-11-28T22:17:03.000-08:00
diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc
@@ -666,6 +666,10 @@ void TemperedLB::doLBStages(LoadType start_imb) {
         underloaded_.clear();
         load_info_.clear();
         is_overloaded_ = is_underloaded_ = false;
+        other_rank_clusters_.clear();
+
+        // Not clearing shared_block_size_ because this never changes and
+        // the knowledge might be useful
       }
 
       vt_debug_print(
@@ -686,11 +690,12 @@ void TemperedLB::doLBStages(LoadType start_imb) {
 
         computeClusterSummary();
 
+        // Verbose printing about local clusters
         for (auto const& [shared_id, value] : cur_blocks_) {
           auto const& [shared_bytes, cluster_load] = value;
           vt_print(
             temperedlb,
-            "Cluster: id={}, bytes={}, load={}\n",
+            "Local cluster: id={}, bytes={}, load={}\n",
             shared_id, shared_bytes, cluster_load
           );
         }
@@ -714,6 +719,28 @@ void TemperedLB::doLBStages(LoadType start_imb) {
         vtAbort("TemperedLB:: Unsupported inform type");
       }
 
+      // Very verbose printing of all remote clusters we know about; this can
+      // be shut off later
+      for (auto const& [node, clusters] : other_rank_clusters_) {
+        for (auto const& [shared_id, value] : clusters) {
+          auto const& [shared_bytes, cluster_load] = value;
+          vt_print(
+            temperedlb,
+            "Remote cluster: node={}, id={}, bytes={}, load={}\n",
+            node, shared_id, shared_bytes, cluster_load
+          );
+        }
+      }
+
+      // Copy remote cluster information into shared_block_size_ so we have all
+      // the sizes in the same place
+      for (auto const& [node, clusters] : other_rank_clusters_) {
+        for (auto const& [shared_id, value] : clusters) {
+          auto const& [shared_bytes, _] = value;
+          shared_block_size_[shared_id] = shared_bytes;
+        }
+      }
+
       // Execute transfer stage
       switch (transfer_type_) {
       case TransferTypeEnum::Original:
@@ -1049,6 +1076,16 @@ void TemperedLB::propagateIncomingAsync(LoadMsgAsync* msg) {
     trial_, iter_, k_max_, k_cur_async, from_node, msg->getNodeLoad().size()
   );
 
+  auto const this_node = theContext()->getNode();
+  for (auto const& [node, clusters] : msg->getNodeClusterSummary()) {
+    if (
+      node != this_node and
+      other_rank_clusters_.find(node) == other_rank_clusters_.end()
+    ) {
+      other_rank_clusters_[node] = clusters;
+    }
+  }
+
   for (auto&& elm : msg->getNodeLoad()) {
     if (load_info_.find(elm.first) == load_info_.end()) {
       load_info_[elm.first] = elm.second;
@@ -1083,6 +1120,16 @@ void TemperedLB::propagateIncomingSync(LoadMsgSync* msg) {
     trial_, iter_, k_max_, k_cur_, from_node, msg->getNodeLoad().size()
   );
 
+  auto const this_node = theContext()->getNode();
+  for (auto const& [node, clusters] : msg->getNodeClusterSummary()) {
+    if (
+      node != this_node and
+      other_rank_clusters_.find(node) == other_rank_clusters_.end()
+    ) {
+      other_rank_clusters_[node] = clusters;
+    }
+  }
+
   for (auto&& elm : msg->getNodeLoad()) {
     if (new_load_info_.find(elm.first) == new_load_info_.end()) {
       new_load_info_[elm.first] = elm.second;
diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h
@@ -228,6 +228,8 @@ struct TemperedLB : BaseLB {
   std::unordered_map<ObjIDType, BytesType> obj_working_bytes_;
   /// Current assignment memory/load summary
   ClusterSummaryType cur_blocks_;
+  /// Clusters that we know of on other ranks (might be out of date)
+  std::unordered_map<NodeType, ClusterSummaryType> other_rank_clusters_;
   /// User-defined memory threshold
   BytesType mem_thresh_ = 0;
 };