Skip to content

Commit 6e33dfb

Browse files
authored
expose metric to report reasons why full GCs were triggered (JuliaLang#55826)
Additional GC observability tool. This will help us to diagnose why some of our servers are triggering so many full GCs in certain circumstances.
1 parent 0dbb6eb commit 6e33dfb

File tree

4 files changed

+65
-2
lines changed

4 files changed

+65
-2
lines changed

base/timing.jl

+27
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,33 @@ function gc_page_utilization_data()
104104
return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false)
105105
end
106106

107+
# Symbolic names of the full-sweep reason counters, listed in the same order as the
# `FULL_SWEEP_REASON_*` indices in `src/gc-stock.h`; the two lists must be kept in sync.
108+
const FULL_SWEEP_REASONS = [:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL, :FULL_SWEEP_REASON_FORCED_FULL_SWEEP,
109+
:FULL_SWEEP_REASON_USER_MAX_EXCEEDED, :FULL_SWEEP_REASON_LARGE_PROMOTION_RATE]
110+
"""
    Base.full_sweep_reasons()

Return a dictionary mapping each full sweep reason to the number of times it has occurred.

The reasons are:
- `:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL`: Full sweep was caused by `always_full` being set in the GC debug environment
- `:FULL_SWEEP_REASON_FORCED_FULL_SWEEP`: Full sweep was forced by `GC.gc(true)`
- `:FULL_SWEEP_REASON_USER_MAX_EXCEEDED`: Full sweep was forced due to the system reaching the heap soft size limit
- `:FULL_SWEEP_REASON_LARGE_PROMOTION_RATE`: Full sweep was forced by a large promotion rate across GC generations

Note that the set of reasons is not guaranteed to be stable across minor versions of Julia.
"""
function full_sweep_reasons()
    # `jl_full_sweep_reasons` is the runtime's table of per-reason counters. Wrap it
    # without taking ownership (`own=false`): the memory is not ours to free.
    table_ptr = cglobal(:jl_full_sweep_reasons, UInt64)
    counters = Base.unsafe_wrap(Vector{UInt64}, table_ptr, length(FULL_SWEEP_REASONS), own=false)
    # Slot `i` of the table belongs to the `i`-th name in `FULL_SWEEP_REASONS`; storing
    # into the `Int64`-valued dictionary converts each `UInt64` counter.
    return Dict{Symbol, Int64}(name => n for (name, n) in zip(FULL_SWEEP_REASONS, counters))
end
133+
107134
"""
108135
Base.jit_total_bytes()
109136

src/gc-stock.c

+13-2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ uv_sem_t gc_sweep_assists_needed;
4040
uv_mutex_t gc_queue_observer_lock;
4141
// Tag for sentinel nodes in bigval list
4242
uintptr_t gc_bigval_sentinel_tag;
43+
// Table recording number of full GCs due to each reason. It has
// `FULL_SWEEP_NUM_REASONS` slots and is indexed by the `FULL_SWEEP_REASON_*` constants.
44+
JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];
4345

4446
// Flag that tells us whether we need to support conservative marking
4547
// of objects.
@@ -3043,10 +3045,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
30433045
// we either free some space or get an OOM error.
30443046
if (gc_sweep_always_full) {
30453047
sweep_full = 1;
3048+
gc_count_full_sweep_reason(FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL);
30463049
}
30473050
if (collection == JL_GC_FULL && !prev_sweep_full) {
30483051
sweep_full = 1;
30493052
recollect = 1;
3053+
gc_count_full_sweep_reason(FULL_SWEEP_REASON_FORCED_FULL_SWEEP);
30503054
}
30513055
if (sweep_full) {
30523056
// these are the difference between the number of gc-perm bytes scanned
@@ -3182,10 +3186,17 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
31823186
}
31833187

31843188
double old_ratio = (double)promoted_bytes/(double)heap_size;
3185-
if (heap_size > user_max || old_ratio > 0.15)
3189+
if (heap_size > user_max) {
31863190
next_sweep_full = 1;
3187-
else
3191+
gc_count_full_sweep_reason(FULL_SWEEP_REASON_USER_MAX_EXCEEDED);
3192+
}
3193+
else if (old_ratio > 0.15) {
3194+
next_sweep_full = 1;
3195+
gc_count_full_sweep_reason(FULL_SWEEP_REASON_LARGE_PROMOTION_RATE);
3196+
}
3197+
else {
31883198
next_sweep_full = 0;
3199+
}
31893200
if (heap_size > user_max || thrashing)
31903201
under_pressure = 1;
31913202
// sweeping is over

src/gc-stock.h

+14
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,20 @@ FORCE_INLINE void gc_big_object_link(bigval_t *sentinel_node, bigval_t *node) JL
505505
sentinel_node->next = node;
506506
}
507507

508+
// Reasons for which a full sweep can be triggered. Each value is an index into
// `jl_full_sweep_reasons`, so the values must stay dense and their order must be
// kept in sync with `FULL_SWEEP_REASONS` in `base/timing.jl`.
509+
#define FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL (0)
510+
#define FULL_SWEEP_REASON_FORCED_FULL_SWEEP (1)
511+
#define FULL_SWEEP_REASON_USER_MAX_EXCEEDED (2)
512+
#define FULL_SWEEP_REASON_LARGE_PROMOTION_RATE (3)
513+
#define FULL_SWEEP_NUM_REASONS (4)
514+
515+
// Per-reason counters, one slot per `FULL_SWEEP_REASON_*` value.
extern JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];
516+
STATIC_INLINE void gc_count_full_sweep_reason(int reason) JL_NOTSAFEPOINT
517+
{
518+
assert(reason >= 0 && reason < FULL_SWEEP_NUM_REASONS);
519+
jl_full_sweep_reasons[reason]++;
520+
}
521+
508522
extern uv_mutex_t gc_perm_lock;
509523
extern uv_mutex_t gc_threads_lock;
510524
extern uv_cond_t gc_threads_cond;

test/gc.jl

+11
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ function issue_54275_test()
4949
@test !live_bytes_has_grown_too_much
5050
end
5151

# Trigger a collection with `GC.gc()`, then check that the forced-full-sweep counter
# reported by `Base.full_sweep_reasons()` is at least one and that the dictionary has
# exactly one key per name in `Base.FULL_SWEEP_REASONS`.
function full_sweep_reasons_test()
    GC.gc()
    counts = Base.full_sweep_reasons()
    @test counts[:FULL_SWEEP_REASON_FORCED_FULL_SWEEP] > 0
    @test Set(keys(counts)) == Set(Base.FULL_SWEEP_REASONS)
end
58+
5259
# !!! note:
5360
# Since we run our tests on 32bit OS as well we confine ourselves
5461
# to parameters that allocate about 512MB of objects. Max RSS is lower
@@ -73,6 +80,10 @@ end
7380
@test isempty(Docs.undocumented_names(GC))
7481
end
7582

83+
# Run the `Base.full_sweep_reasons()` checks defined in `full_sweep_reasons_test`.
@testset "Full GC reasons" begin
84+
full_sweep_reasons_test()
85+
end
86+
7687
#testset doesn't work here because this needs to run in top level
7788
#Check that we ensure objects in toplevel exprs are rooted
7889
global dims54422 = [] # allocate the Binding

0 commit comments

Comments
 (0)