@@ -455,9 +455,9 @@ void TemperedLB::inputParams(balance::ConfigEntry* config) {
455
455
void TemperedLB::runLB (LoadType total_load) {
456
456
bool should_lb = false ;
457
457
458
+ // Compute load statistics
458
459
this_load = total_load;
459
460
stats = *getStats ();
460
-
461
461
auto const avg = stats.at (lb::Statistic::Rank_load_modeled).at (
462
462
lb::StatisticQuantity::avg
463
463
);
@@ -481,10 +481,12 @@ void TemperedLB::runLB(LoadType total_load) {
481
481
target_max_load_ = avg;
482
482
}
483
483
484
- if (avg > 0.0000000001 ) {
484
+ // Only consider load balancing when the average load exceeds an absolute minimum bound
485
+ if (avg > 1e-10 ) {
485
486
should_lb = max > (run_temperedlb_tolerance + 1.0 ) * target_max_load_;
486
487
}
487
488
489
+ // Report statistics from head rank
488
490
if (theContext ()->getNode () == 0 ) {
489
491
vt_debug_print (
490
492
terse, temperedlb,
@@ -501,6 +503,7 @@ void TemperedLB::runLB(LoadType total_load) {
501
503
}
502
504
}
503
505
506
+ // Perform load rebalancing when deemed necessary
504
507
if (should_lb) {
505
508
doLBStages (imb);
506
509
}
@@ -814,15 +817,16 @@ void TemperedLB::propagateRound(uint8_t k_cur, bool sync, EpochType epoch) {
814
817
selected.insert (this_node);
815
818
}
816
819
820
+ // Determine fanout factor capped by number of nodes
817
821
auto const fanout = std::min (f_, static_cast <decltype (f_)>(num_nodes - 1 ));
818
-
819
822
vt_debug_print (
820
823
verbose, temperedlb,
821
824
" TemperedLB::propagateRound: trial={}, iter={}, k_max={}, k_cur={}, "
822
825
" selected.size()={}, fanout={}\n " ,
823
826
trial_, iter_, k_max_, k_cur, selected.size (), fanout
824
827
);
825
828
829
+ // Iterate over fanout factor
826
830
for (int i = 0 ; i < fanout; i++) {
827
831
// This implies full knowledge of all processors
828
832
if (selected.size () >= static_cast <size_t >(num_nodes)) {
@@ -849,6 +853,7 @@ void TemperedLB::propagateRound(uint8_t k_cur, bool sync, EpochType epoch) {
849
853
850
854
// Send message with load
851
855
if (sync ) {
856
+ // Message in synchronous mode
852
857
auto msg = makeMessage<LoadMsgSync>(this_node, load_info_);
853
858
if (epoch != no_epoch) {
854
859
envelopeSetEpoch (msg->env , epoch);
@@ -858,6 +863,7 @@ void TemperedLB::propagateRound(uint8_t k_cur, bool sync, EpochType epoch) {
858
863
LoadMsgSync, &TemperedLB::propagateIncomingSync
859
864
>(msg.get ());
860
865
} else {
866
+ // Message in asynchronous mode
861
867
auto msg = makeMessage<LoadMsgAsync>(this_node, load_info_, k_cur);
862
868
if (epoch != no_epoch) {
863
869
envelopeSetEpoch (msg->env , epoch);
@@ -1216,8 +1222,10 @@ std::vector<TemperedLB::ObjIDType> TemperedLB::orderObjects(
1216
1222
void TemperedLB::decide () {
1217
1223
auto lazy_epoch = theTerm ()->makeEpochCollective (" TemperedLB: decide" );
1218
1224
1225
+ // Initialize transfer and rejection counters
1219
1226
int n_transfers = 0 , n_rejected = 0 ;
1220
1227
1228
+ // Try to migrate objects only from overloaded nodes
1221
1229
if (is_overloaded_) {
1222
1230
std::vector<NodeType> under = makeUnderloaded ();
1223
1231
std::unordered_map<NodeType, ObjsType> migrate_objs;
@@ -1250,6 +1258,7 @@ void TemperedLB::decide() {
1250
1258
}
1251
1259
// Rebuild the CMF with the new loads taken into account
1252
1260
auto cmf = createCMF (under);
1261
+
1253
1262
// Select a node using the CMF
1254
1263
auto const selected_node = sampleFromCMF (under, cmf);
1255
1264
@@ -1259,16 +1268,15 @@ void TemperedLB::decide() {
1259
1268
selected_node, load_info_.size ()
1260
1269
);
1261
1270
1271
+ // Find load of selected node
1262
1272
auto load_iter = load_info_.find (selected_node);
1263
1273
vtAssert (load_iter != load_info_.end (), " Selected node not found" );
1264
-
1265
- // The load of the node selected
1266
1274
auto & selected_load = load_iter->second ;
1267
1275
1276
+ // Evaluate criterion for proposed transfer
1268
1277
bool eval = Criterion (criterion_)(
1269
1278
this_new_load_, selected_load, obj_load, target_max_load_
1270
1279
);
1271
-
1272
1280
vt_debug_print (
1273
1281
verbose, temperedlb,
1274
1282
" TemperedLB::decide: trial={}, iter={}, under.size()={}, "
@@ -1288,9 +1296,10 @@ void TemperedLB::decide() {
1288
1296
eval
1289
1297
);
1290
1298
1299
+ // Decide about proposed migration based on criterion evaluation
1291
1300
if (eval) {
1292
1301
++n_transfers;
1293
- // transfer the object load in seconds
1302
+ // Transfer the object load in seconds
1294
1303
// to match the object load units on the receiving end
1295
1304
migrate_objs[selected_node][obj_id] = obj_load;
1296
1305
@@ -1315,7 +1324,6 @@ void TemperedLB::decide() {
1315
1324
auto node = migration.first ;
1316
1325
lazyMigrateObjsTo (lazy_epoch, node, migration.second );
1317
1326
}
1318
-
1319
1327
} else {
1320
1328
// do nothing (underloaded-based algorithm), waits to get work from
1321
1329
// overloaded nodes
0 commit comments