@@ -2403,8 +2403,15 @@ void TemperedLB::considerSubClustersAfterLock(MsgSharedPtr<LockedInfoMsg> msg) {
2403
2403
}
2404
2404
2405
2405
void TemperedLB::considerSwapsAfterLock (MsgSharedPtr<LockedInfoMsg> msg) {
2406
+ consider_swaps_counter_++;
2406
2407
is_swapping_ = true ;
2407
2408
2409
+ vt_debug_print (
2410
+ verbose, temperedlb,
2411
+ " considerSwapsAfterLock: consider_swaps_counter_={} start\n " ,
2412
+ consider_swaps_counter_
2413
+ );
2414
+
2408
2415
auto const this_node = theContext ()->getNode ();
2409
2416
2410
2417
NodeInfo this_info{
@@ -2532,6 +2539,11 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr<LockedInfoMsg> msg) {
2532
2539
] = removeClusterToSend (src_shared_id);
2533
2540
2534
2541
runInEpochRooted (" giveCluster" , [&]{
2542
+ vt_debug_print (
2543
+ verbose, temperedlb,
2544
+ " considerSwapsAfterLock: giveCluster swapping {} for {}, epoch={:x}\n " ,
2545
+ src_shared_id, try_shared_id, theMsg ()->getEpoch ()
2546
+ );
2535
2547
proxy_[try_rank].template send <&TemperedLB::giveCluster>(
2536
2548
this_node,
2537
2549
give_shared_blocks_size,
@@ -2556,7 +2568,14 @@ void TemperedLB::considerSwapsAfterLock(MsgSharedPtr<LockedInfoMsg> msg) {
2556
2568
2557
2569
proxy_[try_rank].template send <&TemperedLB::releaseLock>();
2558
2570
2571
+ vt_debug_print (
2572
+ verbose, temperedlb,
2573
+ " considerSwapsAfterLock: consider_swaps_counter_={} finish\n " ,
2574
+ consider_swaps_counter_
2575
+ );
2576
+
2559
2577
is_swapping_ = false ;
2578
+ consider_swaps_counter_--;
2560
2579
2561
2580
if (pending_actions_.size () > 0 ) {
2562
2581
auto action = pending_actions_.back ();
@@ -2618,8 +2637,10 @@ void TemperedLB::giveCluster(
2618
2637
2619
2638
vt_debug_print (
2620
2639
normal , temperedlb,
2621
- " giveCluster: total memory usage={}, shared blocks here={}, "
2622
- " memory_threshold={}, give_cluster={}, take_cluster={}\n " , computeMemoryUsage (),
2640
+ " giveCluster: from_rank={}, epoch={:x} total memory usage={}, shared blocks here={}, "
2641
+ " memory_threshold={}, give_cluster={}, take_cluster={}\n " ,
2642
+ from_rank, theMsg ()->getEpoch (),
2643
+ computeMemoryUsage (),
2623
2644
getSharedBlocksHere ().size (), mem_thresh_,
2624
2645
give_shared_blocks_size.begin ()->first , take_cluster
2625
2646
);
@@ -2650,8 +2671,8 @@ void TemperedLB::lockObtained(LockedInfoMsg* in_msg) {
2650
2671
2651
2672
vt_debug_print (
2652
2673
normal , temperedlb,
2653
- " lockObtained: is_locked_={}, is_subclustering_={}\n " ,
2654
- is_locked_, is_subclustering_
2674
+ " lockObtained: is_locked_={}, is_subclustering_={}, is_swapping_={} \n " ,
2675
+ is_locked_, is_subclustering_, is_swapping_
2655
2676
);
2656
2677
2657
2678
auto cur_epoch = theMsg ()->getEpoch ();
@@ -2678,6 +2699,12 @@ void TemperedLB::lockObtained(LockedInfoMsg* in_msg) {
2678
2699
} else if (is_swapping_) {
2679
2700
pending_actions_.push_back (action);
2680
2701
} else {
2702
+ vt_debug_print (
2703
+ normal , temperedlb,
2704
+ " lockObtained: running action immediately\n "
2705
+ );
2706
+
2707
+
2681
2708
action ();
2682
2709
}
2683
2710
}
0 commit comments