Commit fba207f

Add caching memory allocator (#708)

1 parent 132deaf · commit fba207f

12 files changed: +265 −104 lines

Project.toml (+1 −1)

@@ -1,7 +1,7 @@
 name = "AMDGPU"
 uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>", "Valentin Churavy <v.churavy@gmail.com>", "Anton Smirnov <tonysmn97@gmail.com>"]
-version = "1.1.3"
+version = "1.1.4"

 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"

docs/make.jl (+1 −3)

@@ -27,10 +27,8 @@ function main()
         "Exceptions" => "exceptions.md",
         "Profiling" => "profiling.md",
         "Memory" => "memory.md",
+        "Caching Memory Allocator" => "caching_allocator.md",
         "Host-Call" => "hostcall.md",
-        "Intrinsics" => [
-            "Execution Control" => "execution_control.md",
-        ],
         "Printing" => "printing.md",
         "Logging" => "logging.md",
         "API Reference" => "api.md"

docs/src/assets/gc-vram-breakdown.png (47.6 KB, new image)

Two further image assets added (6.48 KB and 6.97 KB).

docs/src/caching_allocator.md (new file, +76)

# Caching Memory Allocator

Julia uses garbage collection (GC) for automatic memory management.
However, the GC does not track other memory spaces, so it sees no difference
between a 1 KiB GPU allocation and a 1 GiB one and does not free them in time.

This leads to situations where all of the GPU memory is used,
even though your algorithm only requires a fraction of it.

The current mechanism for dealing with OOM (out-of-memory) errors during
allocations is to manually trigger GC and retry the allocation, doing this in
several rounds, each more aggressive than the previous one.

However, manually triggering GC is very expensive, since it requires scanning
all Julia objects, not just ROCArrays, so the actual memory freeing takes only
a fraction of the GC time:
![](./assets/gc-vram-breakdown.png)

In the image above, the red region is a call to GC and the green region is
where the actual GPU memory is freed.
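As a rough sketch (not the actual AMDGPU.jl implementation), the retry-with-GC pattern described above looks something like the following, where `alloc_with_gc_retries` and `alloc_fn` are invented names standing in for any allocating call:

```julia
# Illustrative only: retry an allocation, triggering GC between attempts,
# with later rounds doing a full (more expensive) collection.
function alloc_with_gc_retries(alloc_fn; rounds::Int = 3)
    for round in 0:rounds
        try
            return alloc_fn()
        catch err
            # A real implementation would first check that `err` is
            # an out-of-memory error before retrying.
            round == rounds && rethrow()
            GC.gc(round > 0)  # full collection on later rounds
        end
    end
end
```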
---

To help with memory management, we can use a caching memory allocator.
It is useful in scenarios where we execute the same function multiple times
and have the same memory allocation pattern.
One such example is training DL models, where given the model and its parameters
we compute the loss, the gradients w.r.t. the loss, and perform an in-place
parameter update. In this case, every iteration performs the same operations
and memory allocations, and with the caching allocator we can efficiently
re-use them without returning the memory back to the OS.
## Example

We have a for-loop where each iteration requires 2 GiB of VRAM.
We create a caching allocator with the name `:loop` and pass it a function to
execute. The first iteration allocates, but subsequent iterations don't.

```julia
using AMDGPU

function main()
    n = 1024^2 * 256
    for i in 1:1000
        AMDGPU.with_caching_allocator(:loop, n) do n
            sin.(AMDGPU.rand(Float32, n)) # 2 GiB allocation
            return
        end
    end
end
```
The reason for marking a region of code in which to re-use the memory,
instead of extending this to the whole program, is that we cannot rely on GC
to tell us when the memory is no longer used (it is too slow for that),
so we create such a region manually.

You can free all memory held by an allocator by invalidating it using its name
with [`AMDGPU.invalidate_caching_allocator!`](@ref).
And if you want some region of code within [`AMDGPU.with_caching_allocator`](@ref)
to execute without relying on the cache, use [`AMDGPU.with_no_caching`](@ref).
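A hypothetical sketch combining these calls (`compute!`, the loop length, and the array sizes are invented for illustration):

```julia
using AMDGPU

compute!(n) = (sin.(AMDGPU.rand(Float32, n)); nothing)

function run(n)
    for i in 1:100
        AMDGPU.with_caching_allocator(:loop, n) do n
            compute!(n)                    # allocations here are cached under `:loop`
            AMDGPU.with_no_caching() do    # allocations here bypass the cache
                AMDGPU.zeros(Float32, 4)
            end
            return
        end
    end
    # Release all memory held by the `:loop` allocator once the loop is done.
    AMDGPU.invalidate_caching_allocator!(:loop)
end
```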
| | Without Caching Allocator | With Caching Allocator |
|:---:|:---:|:---:|
| VRAM Usage | ![](./assets/without-caching-allocator.png) | ![](./assets/with-caching-allocator.png) |
| Execution time (seconds) | `12.865149` | `0.020943` |

## API

```@docs
AMDGPU.with_caching_allocator
AMDGPU.with_no_caching
AMDGPU.invalidate_caching_allocator!
```

docs/src/execution_control.md (−27)

This file was deleted.

src/AMDGPU.jl (+1 −1)

@@ -114,7 +114,7 @@ include("tls.jl")
 include("highlevel.jl")
 include("reflection.jl")
 include("array.jl")
-include("memory_record.jl")
+include("caching_allocator.jl")
 include("conversions.jl")
 include("broadcast.jl")
 include("exception_handler.jl")

src/array.jl (+18 −5)

@@ -7,11 +7,24 @@ mutable struct ROCArray{T, N, B} <: AbstractGPUArray{T, N}
         ::UndefInitializer, dims::Dims{N},
     ) where {T, N, B <: Mem.AbstractAMDBuffer}
         @assert isbitstype(T) "ROCArray only supports bits types"
-        data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
-        x = new{T, N, B}(data, dims, 0)
-        x = finalizer(unsafe_free!, x)
-        RECORD_MEMORY[] && record!(x)
-        return x
+
+        alloc_name = cache_alloc_name()
+        # Do not use caching allocator if it is not set or
+        # the buffer is not a device memory.
+        x = if !(B <: Mem.HIPBuffer) || alloc_name == :none
+            data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+            x = new{T, N, B}(data, dims, 0)
+        else
+            alloc = cache_allocator!(alloc_name)
+            tmp = alloc!(alloc, B, T, dims)
+            if tmp ≡ nothing
+                data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+                tmp = new{T, N, B}(data, dims, 0)
+                add_busy!(alloc, tmp)
+            end
+            tmp::ROCArray{T, N, B}
+        end
+        return finalizer(unsafe_free!, x)
     end

     function ROCArray{T, N}(

src/caching_allocator.jl (new file, +151)

# NOTE: EXPERIMENTAL API.

struct CacheAllocator
    lock::ReentrantLock
    busy::Dict{UInt64, Vector{ROCArray}} # hash((T, dims)) => ROCArray[]
    free::Dict{UInt64, Vector{ROCArray}}
end

CacheAllocator() = CacheAllocator(
    ReentrantLock(),
    Dict{UInt64, Vector{ROCArray}}(),
    Dict{UInt64, Vector{ROCArray}}(),
)

const CACHE_ALLOCS::LockedObject{Dict{Symbol, CacheAllocator}} =
    LockedObject(Dict{Symbol, CacheAllocator}())

function cache_allocator!(cache_name::Symbol)
    allocs = CACHE_ALLOCS.payload
    alloc = get(allocs, cache_name, nothing)
    alloc ≡ nothing || return alloc

    return Base.@lock CACHE_ALLOCS.lock begin
        allocs[cache_name] = CacheAllocator()
    end
end

function get_free_pool(alloc::CacheAllocator, uid)
    free_pool = get(alloc.free, uid, nothing)
    if free_pool ≡ nothing
        free_pool = Base.@lock alloc.lock alloc.free[uid] = ROCArray[]
    end
    return free_pool
end

function get_busy_pool(alloc::CacheAllocator, uid)
    busy_pool = get(alloc.busy, uid, nothing)
    if busy_pool ≡ nothing
        busy_pool = Base.@lock alloc.lock alloc.busy[uid] = ROCArray[]
    end
    return busy_pool
end

function alloc!(
    alloc::CacheAllocator, ::Type{Mem.HIPBuffer}, ::Type{T}, dims::Dims{N},
)::Maybe{ROCArray{T, N, Mem.HIPBuffer}} where {T, N}
    uid = hash((T, dims))
    free_pool = get_free_pool(alloc, uid)
    isempty(free_pool) && return nothing

    # @info "Cache hit"
    busy_pool = get_busy_pool(alloc, uid)
    x = pop!(free_pool)
    # Array was manually freed via `unsafe_free!`.
    x.buf.freed && return nothing

    push!(busy_pool, x)
    return x
end

# Mark `x` array as busy, used during cache misses to add new allocations.
function add_busy!(alloc::CacheAllocator, x::ROCArray{T}) where T
    uid = hash((T, size(x)))
    busy_pool = get_busy_pool(alloc, uid)
    Base.@lock alloc.lock push!(busy_pool, x)
    return
end

function free_busy!(alloc::CacheAllocator)
    for uid in alloc.busy.keys
        free_pool = get_free_pool(alloc, uid)
        busy_pool = get_busy_pool(alloc, uid)
        isempty(busy_pool) && continue

        Base.@lock alloc.lock begin
            append!(free_pool, busy_pool)
            empty!(busy_pool)
        end
    end
end

# Public API.

"""
    with_caching_allocator(f, alloc_name::Symbol, args...)

Execute function `f` with arguments `args...` using
the caching allocator given by its name `alloc_name`.

All GPU memory allocations will attempt to hit this cache
before doing an actual allocation (in case of a cache miss).
After executing `f`, all "busy" memory within the allocator is marked as free,
so it can be re-used on the next call.

# Returns

Result of the `f` function.
"""
function with_caching_allocator(f, alloc_name::Symbol, args...)
    alloc = cache_allocator!(alloc_name)
    # Enable usage of the cache allocator during allocations.
    cache_alloc_name!(alloc_name)
    res = f(args...)
    # Mark all allocations during `f` as free to re-use and disable the allocator.
    free_busy!(alloc)
    cache_alloc_name!(:none)
    return res
end

"""
    with_no_caching(f)

Execute function `f`, but avoid hitting any caching allocator.
This is useful to call from within [`with_caching_allocator`](@ref),
so that the memory is independent from it.

# Returns

Result of the `f` function.
"""
function with_no_caching(f)
    alloc_name = cache_alloc_name()
    cache_alloc_name!(:none)
    res = f()
    cache_alloc_name!(alloc_name)
    return res
end

"""
    invalidate_caching_allocator!(alloc_name::Symbol)

Free all memory held by the caching allocator given by its name `alloc_name`.
"""
function invalidate_caching_allocator!(alloc_name::Symbol)
    alloc = cache_allocator!(alloc_name)
    alloc ≡ nothing && return

    Base.@lock alloc.lock begin
        for (_, pool) in alloc.free
            map(AMDGPU.unsafe_free!, pool)
        end
        # TODO: if other threads use the same allocator, signal that it is invalidated somehow?
        # TODO: error if pool is in use, i.e. non-empty `busy`?
        for (_, pool) in alloc.busy
            map(AMDGPU.unsafe_free!, pool)
        end
        empty!(alloc.busy)
        empty!(alloc.free)
    end
    return
end
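Since `alloc!` keys both pools by `hash((T, dims))`, an array is re-used only when the element type and shape of a request match exactly. A small standalone sketch of how that key behaves (plain Julia, no GPU required; the variable names are invented for illustration):

```julia
# The pool key used by the allocator above: identical eltype and dims
# map to the same pool; anything else is a cache miss.
key_a = hash((Float32, (1024, 1024)))
key_b = hash((Float32, (1024, 1024)))
key_c = hash((Float32, (1024, 2048)))

@assert key_a == key_b  # same type and shape => same pool
@assert key_a != key_c  # different shape => separate pool
```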

src/memory_record.jl (−48)

This file was deleted.
