Skip to content

Commit 23bd9c3

Browse files
cwschilly, cz4rs
authored and committed
#2201: wip: fix review comments; add collection_id to synthetic data
1 parent 63ef499 commit 23bd9c3

File tree

6 files changed

+45
-27
lines changed

6 files changed

+45
-27
lines changed

src/vt/vrt/collection/balance/temperedlb/temperedlb.cc

+38-19
Original file line number | Diff line number | Diff line change
@@ -257,6 +257,7 @@ Default: true
257257
Description:
258258
If the final iteration of a trial has a worse imbalance than any earlier
259259
iteration, it will roll back to the iteration with the best imbalance.
260+
If transfer_strategy is SwapClusters, rollback is automatically set to false.
260261
)"
261262
},
262263
{
@@ -291,32 +292,32 @@ Description: α in the work model (load in work model)
291292
"beta",
292293
R"(
293294
Values: <double>
294-
Defaut: 1.0
295+
Defaut: 0.0
295296
Description: β in the work model (inter-node communication in work model)
296297
)"
297298
},
298299
{
299-
"epsilon",
300+
"gamma",
300301
R"(
301302
Values: <double>
302-
Defaut: 1.0
303-
Description: ε in the work model (memory term in work model)
303+
Defaut: 0.0
304+
Description: γ in the work model (intra-node communication in work model)
304305
)"
305306
},
306307
{
307308
"delta",
308309
R"(
309310
Values: <double>
310-
Defaut: 1.0
311+
Defaut: 0.0
311312
Description: δ in the work model (shared-memory-edges in work model)
312313
)"
313314
},
314315
{
315-
"gamma",
316+
"epsilon",
316317
R"(
317318
Values: <double>
318-
Defaut: 1.0
319-
Description: γ in the work model (intra-node communication in work model)
319+
Defaut: infinity
320+
Description: ε in the work model (memory term in work model)
320321
)"
321322
}
322323
};
@@ -456,6 +457,10 @@ void TemperedLB::inputParams(balance::ConfigEntry* config) {
456457
);
457458
transfer_type_ = transfer_type_converter_.getFromConfig(config, transfer_type_);
458459

460+
if (transfer_type_ == TransferTypeEnum::SwapClusters) {
461+
rollback_ = false;
462+
}
463+
459464
balance::LBArgsEnumConverter<ObjectOrderEnum> obj_ordering_converter_(
460465
"ordering", "ObjectOrderEnum", {
461466
{ObjectOrderEnum::Arbitrary, "Arbitrary"},
@@ -1066,10 +1071,22 @@ void TemperedLB::doLBStages(LoadType start_imb) {
10661071
if (first_iter) {
10671072
// Copy this node's object assignments to a local, mutable copy
10681073
cur_objs_.clear();
1074+
int total_num_objs = 0;
1075+
int num_migratable_objs = 0;
10691076
for (auto obj : *load_model_) {
1070-
cur_objs_[obj] = getModeledValue(obj);
1077+
total_num_objs++;
1078+
if (obj.isMigratable()) {
1079+
num_migratable_objs++;
1080+
cur_objs_[obj] = getModeledValue(obj);
1081+
}
10711082
}
10721083

1084+
vt_debug_print(
1085+
normal, temperedlb,
1086+
"TemperedLB::doLBStages: Found {} migratable objects out of {}.\n",
1087+
num_migratable_objs, total_num_objs
1088+
);
1089+
10731090
send_edges_.clear();
10741091
recv_edges_.clear();
10751092
bool has_comm = false;
@@ -1326,12 +1343,14 @@ void TemperedLB::doLBStages(LoadType start_imb) {
13261343
);
13271344
}
13281345

1329-
auto remote_block_count = getRemoteBlockCountHere();
1330-
runInEpochCollective("TemperedLB::doLBStages -> compute unhomed", [=] {
1331-
proxy_.allreduce<&TemperedLB::remoteBlockCountHandler, collective::PlusOp>(
1332-
remote_block_count
1333-
);
1334-
});
1346+
// Skip this block when not using SwapClusters
1347+
if (transfer_type_ == TransferTypeEnum::SwapClusters) {
1348+
auto remote_block_count = getRemoteBlockCountHere();
1349+
runInEpochCollective("TemperedLB::doLBStages -> compute unhomed", [=] {
1350+
proxy_.allreduce<&TemperedLB::remoteBlockCountHandler,
1351+
collective::PlusOp>(remote_block_count);
1352+
});
1353+
}
13351354
} else if (this_node == 0) {
13361355
vt_debug_print(
13371356
terse, temperedlb,
@@ -2269,7 +2288,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr<LockedInfoMsg> msg) {
22692288
try_new_mem += src_cluster.cluster_footprint;
22702289

22712290
if (try_new_mem > mem_thresh_) {
2272-
return - std::numeric_limits<double>::infinity();
2291+
return - epsilon;
22732292
}
22742293

22752294
BytesType src_new_mem = current_memory_usage_;
@@ -2289,7 +2308,7 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr<LockedInfoMsg> msg) {
22892308
src_new_mem -= src_cluster.cluster_footprint;
22902309

22912310
if (src_new_mem > mem_thresh_) {
2292-
return - std::numeric_limits<double>::infinity();
2311+
return - epsilon;
22932312
}
22942313

22952314
double const src_new_work =
@@ -2596,12 +2615,12 @@ void TemperedLB::swapClusters() {
25962615

25972616
// Necessary but not sufficient check regarding memory bounds
25982617
if (try_mem - try_cluster.bytes + src_cluster.bytes > mem_thresh_) {
2599-
return - std::numeric_limits<double>::infinity();
2618+
return - epsilon;
26002619
}
26012620

26022621
auto const src_mem = current_memory_usage_;
26032622
if (src_mem + try_cluster.bytes - src_cluster.bytes > mem_thresh_) {
2604-
return - std::numeric_limits<double>::infinity();
2623+
return - epsilon;
26052624
}
26062625

26072626
auto const& try_info = load_info_.find(try_rank)->second;

src/vt/vrt/collection/balance/temperedlb/temperedlb.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -417,7 +417,7 @@ struct TemperedLB : BaseLB {
417417
double beta = 0.0;
418418
double gamma = 0.0;
419419
double delta = 0.0;
420-
double epsilon = 0.0;
420+
double epsilon = std::numeric_limits<double>::infinity();
421421
std::vector<bool> propagated_k_;
422422
std::mt19937 gen_propagate_;
423423
std::mt19937 gen_sample_;
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
{"metadata":{"type":"LBDatafile","rank":0},"phases":[{"id":0,"tasks":[{"entity":{"home":0,"id":1,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":3,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":2,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":0,"migratable":true,"type":"object"},"node":0,"resource":"cpu","time":1.0,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":5},"messages":1,"from":{"type":"object","id":0},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":4},"messages":1,"from":{"type":"object","id":1},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":2},"messages":1,"from":{"type":"object","id":3},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":3},"bytes":0.5}]}]}
1+
{"metadata":{"type":"LBDatafile","rank":0},"phases":[{"id":0,"tasks":[{"entity":{"home":0,"id":1,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":3,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":2,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":0.5,"user_defined":{"shared_id":1,"shared_bytes":9.0,"home_rank":0}},{"entity":{"home":0,"id":0,"migratable":true,"collection_id":7,"type":"object"},"node":0,"resource":"cpu","time":1.0,"user_defined":{"shared_id":0,"shared_bytes":9.0,"home_rank":0}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":5},"messages":1,"from":{"type":"object","id":0},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":4},"messages":1,"from":{"type":"object","id":1},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":2},"messages":1,"from":{"type":"object","id":3},"bytes":1.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":3},"bytes":0.5}]}]}
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
{"metadata":{"type":"LBDatafile","rank":1},"phases":[{"id":0,"tasks":[{"entity":{"home":1,"id":5,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":2.0,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":4,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":7,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":6,"migratable":true,"type":"object"},"node":1,"resource":"cpu","time":1.0,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":1},"messages":1,"from":{"type":"object","id":4},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":5},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":7},"bytes":1.0}]}]}
1+
{"metadata":{"type":"LBDatafile","rank":1},"phases":[{"id":0,"tasks":[{"entity":{"home":1,"id":5,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":2.0,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":4,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":2,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":7,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":0.5,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}},{"entity":{"home":1,"id":6,"migratable":true,"collection_id":7,"type":"object"},"node":1,"resource":"cpu","time":1.0,"user_defined":{"shared_id":3,"shared_bytes":9.0,"home_rank":1}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":1},"messages":1,"from":{"type":"object","id":4},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":8},"messages":1,"from":{"type":"object","id":5},"bytes":2.0},{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":7},"bytes":1.0}]}]}
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
{"metadata":{"type":"LBDatafile","rank":2},"phases":[{"id":0,"tasks":[{"entity":{"home":2,"id":8,"migratable":true,"type":"object"},"node":2,"resource":"cpu","time":1.5,"user_defined":{"shared_id":4,"shared_bytes":9.0,"home_rank":2}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":8},"bytes":1.5}]}]}
1+
{"metadata":{"type":"LBDatafile","rank":2},"phases":[{"id":0,"tasks":[{"entity":{"home":2,"id":8,"migratable":true,"collection_id":7,"type":"object"},"node":2,"resource":"cpu","time":1.5,"user_defined":{"shared_id":4,"shared_bytes":9.0,"home_rank":2}}],"communications":[{"type":"SendRecv","to":{"type":"object","id":6},"messages":1,"from":{"type":"object","id":8},"bytes":1.5}]}]}

tests/unit/lb/test_temperedlb.cc

+3-4
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,6 @@ std::string writeTemperedLBConfig(std::string transfer_strategy,
2727
" gamma=" << gamma <<
2828
" delta=" << delta;
2929
if (transfer_strategy == "SwapClusters") {
30-
cfg_file_ << " rollback=false";
3130
if (mem_constraints) {
3231
cfg_file_ << " memory_threshold=20.0";
3332
} else {
@@ -40,6 +39,9 @@ std::string writeTemperedLBConfig(std::string transfer_strategy,
4039
}
4140

4241
void runTemperedLBTest(std::string config_file, double expected_imb = 0.0) {
42+
// Clear the LB config
43+
vrt::collection::balance::ReadLBConfig::clear();
44+
4345
// Set configuration
4446
theConfig()->vt_lb = true;
4547
theConfig()->vt_lb_data_in = true;
@@ -59,9 +61,6 @@ void runTemperedLBTest(std::string config_file, double expected_imb = 0.0) {
5961

6062
// Assert that temperedLB found the correct imbalance
6163
EXPECT_EQ(phase_info->imb_load_post_lb, expected_imb);
62-
63-
// Clear the LB config ahead of next test
64-
vrt::collection::balance::ReadLBConfig::clear();
6564
}
6665

6766
TEST_F(TestTemperedLB, test_load_only) {

0 commit comments

Comments
 (0)