Commit fba207f

Add caching memory allocator (#708)

1 parent 132deaf · commit fba207f

12 files changed: +265 −104 lines

Project.toml (+1 −1)

@@ -1,7 +1,7 @@
 name = "AMDGPU"
 uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>", "Valentin Churavy <v.churavy@gmail.com>", "Anton Smirnov <tonysmn97@gmail.com>"]
-version = "1.1.3"
+version = "1.1.4"

 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"

docs/make.jl (+1 −3)

@@ -27,10 +27,8 @@ function main()
         "Exceptions" => "exceptions.md",
         "Profiling" => "profiling.md",
         "Memory" => "memory.md",
+        "Caching Memory Allocator" => "caching_allocator.md",
         "Host-Call" => "hostcall.md",
-        "Intrinsics" => [
-            "Execution Control" => "execution_control.md",
-        ],
         "Printing" => "printing.md",
         "Logging" => "logging.md",
         "API Reference" => "api.md"

docs/src/assets/gc-vram-breakdown.png (47.6 KB, new image)

Two further image assets added (6.48 KB and 6.97 KB).

docs/src/caching_allocator.md (new file, +76)

# Caching Memory Allocator

Julia uses garbage collection (GC) for automatic memory management.
However, the GC does not track other memory spaces, so it sees no difference
between a 1 KiB GPU allocation and a 1 GiB one and does not free them in time.

This leads to situations where all of the GPU memory is used,
even though your algorithm only requires a fraction of it.

The current mechanism for dealing with OOM (out-of-memory) errors during
allocations is to manually trigger GC and retry the allocation, doing this in
several rounds, each more aggressive than the previous one.

However, manually triggering GC is very expensive, since it requires scanning
all Julia objects, not just ROCArrays, so the actual memory freeing takes only
a fraction of the GC time:
![](./assets/gc-vram-breakdown.png)

In the image above, the red region is a call to GC and the green region is
where the actual GPU memory is freed.
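As a rough sketch (not the actual AMDGPU.jl implementation), the retry-with-GC pattern described above looks something like the following, where `alloc_with_gc_retries` and `alloc_fn` are invented names standing in for any allocating call:

```julia
# Illustrative only: retry an allocation, triggering GC between attempts,
# with later rounds doing a full (more expensive) collection.
function alloc_with_gc_retries(alloc_fn; rounds::Int = 3)
    for round in 0:rounds
        try
            return alloc_fn()
        catch err
            # A real implementation would first check that `err` is
            # an out-of-memory error before retrying.
            round == rounds && rethrow()
            GC.gc(round > 0)  # full collection on later rounds
        end
    end
end
```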
---

To help with memory management, we can use a caching memory allocator.
It is useful in scenarios where we execute the same function multiple times
and have the same memory allocation pattern.
One such example is training DL models, where given the model and its parameters
we compute the loss, the gradients w.r.t. the loss, and perform an in-place
parameter update. In this case, every iteration performs the same operations
and memory allocations, and with the caching allocator we can efficiently
re-use them without returning the memory back to the OS.
## Example

We have a for-loop where each iteration requires 2 GiB of VRAM.
We create a caching allocator with the name `:loop` and pass it a function to
execute. The first iteration allocates, but subsequent iterations don't.

```julia
using AMDGPU

function main()
    n = 1024^2 * 256
    for i in 1:1000
        AMDGPU.with_caching_allocator(:loop, n) do n
            sin.(AMDGPU.rand(Float32, n)) # 2 GiB allocation
            return
        end
    end
end
```
The reason for marking a region of code in which to re-use the memory,
instead of extending this to the whole program, is that we cannot rely on GC
to tell us when the memory is no longer used (it is too slow for that),
so we create such a region manually.

You can free all memory held by an allocator by invalidating it using its name
with [`AMDGPU.invalidate_caching_allocator!`](@ref).
And if you want some region of code within [`AMDGPU.with_caching_allocator`](@ref)
to execute without relying on the cache, use [`AMDGPU.with_no_caching`](@ref).
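A hypothetical sketch combining these calls (`compute!`, the loop length, and the array sizes are invented for illustration):

```julia
using AMDGPU

compute!(n) = (sin.(AMDGPU.rand(Float32, n)); nothing)

function run(n)
    for i in 1:100
        AMDGPU.with_caching_allocator(:loop, n) do n
            compute!(n)                    # allocations here are cached under `:loop`
            AMDGPU.with_no_caching() do    # allocations here bypass the cache
                AMDGPU.zeros(Float32, 4)
            end
            return
        end
    end
    # Release all memory held by the `:loop` allocator once the loop is done.
    AMDGPU.invalidate_caching_allocator!(:loop)
end
```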
| | Without Caching Allocator | With Caching Allocator |
|:---:|:---:|:---:|
| VRAM Usage | ![](./assets/without-caching-allocator.png) | ![](./assets/with-caching-allocator.png) |
| Execution time (seconds) | `12.865149` | `0.020943` |

## API

```@docs
AMDGPU.with_caching_allocator
AMDGPU.with_no_caching
AMDGPU.invalidate_caching_allocator!
```

docs/src/execution_control.md (−27)

This file was deleted.

src/AMDGPU.jl (+1 −1)

@@ -114,7 +114,7 @@ include("tls.jl")
 include("highlevel.jl")
 include("reflection.jl")
 include("array.jl")
-include("memory_record.jl")
+include("caching_allocator.jl")
 include("conversions.jl")
 include("broadcast.jl")
 include("exception_handler.jl")

src/array.jl (+18 −5)

@@ -7,11 +7,24 @@ mutable struct ROCArray{T, N, B} <: AbstractGPUArray{T, N}
         ::UndefInitializer, dims::Dims{N},
     ) where {T, N, B <: Mem.AbstractAMDBuffer}
         @assert isbitstype(T) "ROCArray only supports bits types"
-        data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
-        x = new{T, N, B}(data, dims, 0)
-        x = finalizer(unsafe_free!, x)
-        RECORD_MEMORY[] && record!(x)
-        return x
+
+        alloc_name = cache_alloc_name()
+        # Do not use caching allocator if it is not set or
+        # the buffer is not a device memory.
+        x = if !(B <: Mem.HIPBuffer) || alloc_name == :none
+            data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+            x = new{T, N, B}(data, dims, 0)
+        else
+            alloc = cache_allocator!(alloc_name)
+            tmp = alloc!(alloc, B, T, dims)
+            if tmp ≡ nothing
+                data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+                tmp = new{T, N, B}(data, dims, 0)
+                add_busy!(alloc, tmp)
+            end
+            tmp::ROCArray{T, N, B}
+        end
+        return finalizer(unsafe_free!, x)
     end

     function ROCArray{T, N}(

src/caching_allocator.jl (new file, +151)

# NOTE: EXPERIMENTAL API.

struct CacheAllocator
    lock::ReentrantLock
    busy::Dict{UInt64, Vector{ROCArray}} # hash((T, dims)) => ROCArray[]
    free::Dict{UInt64, Vector{ROCArray}}
end

CacheAllocator() = CacheAllocator(
    ReentrantLock(),
    Dict{UInt64, Vector{ROCArray}}(),
    Dict{UInt64, Vector{ROCArray}}(),
)

const CACHE_ALLOCS::LockedObject{Dict{Symbol, CacheAllocator}} =
    LockedObject(Dict{Symbol, CacheAllocator}())

function cache_allocator!(cache_name::Symbol)
    allocs = CACHE_ALLOCS.payload
    alloc = get(allocs, cache_name, nothing)
    alloc ≡ nothing || return alloc

    return Base.@lock CACHE_ALLOCS.lock begin
        allocs[cache_name] = CacheAllocator()
    end
end

function get_free_pool(alloc::CacheAllocator, uid)
    free_pool = get(alloc.free, uid, nothing)
    if free_pool ≡ nothing
        free_pool = Base.@lock alloc.lock alloc.free[uid] = ROCArray[]
    end
    return free_pool
end

function get_busy_pool(alloc::CacheAllocator, uid)
    busy_pool = get(alloc.busy, uid, nothing)
    if busy_pool ≡ nothing
        busy_pool = Base.@lock alloc.lock alloc.busy[uid] = ROCArray[]
    end
    return busy_pool
end

function alloc!(
    alloc::CacheAllocator, ::Type{Mem.HIPBuffer}, ::Type{T}, dims::Dims{N},
)::Maybe{ROCArray{T, N, Mem.HIPBuffer}} where {T, N}
    uid = hash((T, dims))
    free_pool = get_free_pool(alloc, uid)
    isempty(free_pool) && return nothing

    # @info "Cache hit"
    busy_pool = get_busy_pool(alloc, uid)
    x = pop!(free_pool)
    # Array was manually freed via `unsafe_free!`.
    x.buf.freed && return nothing

    push!(busy_pool, x)
    return x
end

# Mark `x` array as busy, used during cache misses to add new allocations.
function add_busy!(alloc::CacheAllocator, x::ROCArray{T}) where T
    uid = hash((T, size(x)))
    busy_pool = get_busy_pool(alloc, uid)
    Base.@lock alloc.lock push!(busy_pool, x)
    return
end

function free_busy!(alloc::CacheAllocator)
    for uid in alloc.busy.keys
        free_pool = get_free_pool(alloc, uid)
        busy_pool = get_busy_pool(alloc, uid)
        isempty(busy_pool) && continue

        Base.@lock alloc.lock begin
            append!(free_pool, busy_pool)
            empty!(busy_pool)
        end
    end
end

# Public API.

"""
    with_caching_allocator(f, alloc_name::Symbol, args...)

Execute function `f` with arguments `args...` using
the caching allocator given by its name `alloc_name`.

All GPU memory allocations will attempt to hit this cache
before doing an actual allocation (in case of a cache miss).
After executing `f`, all "busy" memory within the allocator is marked as free,
so it can be re-used on the next call.

# Returns

Result of the `f` function.
"""
function with_caching_allocator(f, alloc_name::Symbol, args...)
    alloc = cache_allocator!(alloc_name)
    # Enable usage of the cache allocator during allocations.
    cache_alloc_name!(alloc_name)
    res = f(args...)
    # Mark all allocations during `f` as free to re-use and disable the allocator.
    free_busy!(alloc)
    cache_alloc_name!(:none)
    return res
end

"""
    with_no_caching(f)

Execute function `f`, but avoid hitting any caching allocator.
This is useful to call from within [`with_caching_allocator`](@ref),
so that the memory is independent from it.

# Returns

Result of the `f` function.
"""
function with_no_caching(f)
    alloc_name = cache_alloc_name()
    cache_alloc_name!(:none)
    res = f()
    cache_alloc_name!(alloc_name)
    return res
end

"""
    invalidate_caching_allocator!(alloc_name::Symbol)

Free all memory held by the caching allocator given by its name `alloc_name`.
"""
function invalidate_caching_allocator!(alloc_name::Symbol)
    alloc = cache_allocator!(alloc_name)
    alloc ≡ nothing && return

    Base.@lock alloc.lock begin
        for (_, pool) in alloc.free
            map(AMDGPU.unsafe_free!, pool)
        end
        # TODO: if other threads use the same allocator, signal that it is invalidated somehow?
        # TODO: error if pool is in use, i.e. non-empty `busy`?
        for (_, pool) in alloc.busy
            map(AMDGPU.unsafe_free!, pool)
        end
        empty!(alloc.busy)
        empty!(alloc.free)
    end
    return
end
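Since `alloc!` keys both pools by `hash((T, dims))`, an array is re-used only when the element type and shape of a request match exactly. A small standalone sketch of how that key behaves (plain Julia, no GPU required; the variable names are invented for illustration):

```julia
# The pool key used by the allocator above: identical eltype and dims
# map to the same pool; anything else is a cache miss.
key_a = hash((Float32, (1024, 1024)))
key_b = hash((Float32, (1024, 1024)))
key_c = hash((Float32, (1024, 2048)))

@assert key_a == key_b  # same type and shape => same pool
@assert key_a != key_c  # different shape => separate pool
```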

src/memory_record.jl (−48)

This file was deleted.
