Skip to content

Commit 36bfc53

Browse files
authored
Merge pull request #413 from JuliaGPU/pxl-th/miopen-workspace
Fix huge MIOpen workspace allocations
2 parents 87d966f + e2ee719 commit 36bfc53

File tree

4 files changed

+36
-51
lines changed

4 files changed

+36
-51
lines changed

src/array.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ mutable struct ROCArray{T,N} <: AbstractGPUArray{T,N}
6363
syncstate::Runtime.SyncState
6464

6565
function ROCArray{T,N}(
66-
buf::Mem.Buffer, dims::Dims{N};
67-
offset::Integer = 0, syncstate = Runtime.SyncState(),
66+
buf::Mem.Buffer, dims::Dims{N}; offset::Integer = 0,
67+
syncstate::Runtime.SyncState = Runtime.SyncState(),
6868
) where {T,N}
6969
@assert isbitstype(T) "ROCArray only supports bits types"
7070
xs = new{T,N}(buf, dims, offset, syncstate)

src/dnn/convolution.jl

+2-7
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,11 @@ function find_algorithm(
9898
cache = get_benchmark_cache(conv_type, conv_args, dev)
9999
isnothing(cache) || return cache
100100

101-
is_fwd = conv_type == Type{miopenConvFwdAlgorithm_t}
102-
workspace_size = get_workspace_size(conv_type;
103-
handle,
104-
a_desc=(is_fwd ? b_desc : a_desc),
105-
b_desc=(is_fwd ? a_desc : b_desc), conv_desc, c_desc)
106-
107-
workspace = Workspace(dev, workspace_size)
101+
workspace = Workspace(dev, 0)
108102
perf_results = find_conv_algo(conv_type;
109103
handle, workspace, a, a_desc, b, b_desc, conv_desc, c, c_desc)
110104
set_benchmark_cache!(conv_type, conv_args, perf_results)
105+
workspace = Workspace(dev, perf_results.memory)
111106

112107
perf_results, workspace
113108
end

src/runtime/kernel.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ end
3737
function ROCModule(exe::ROCExecutable)
3838
device = exe.device
3939
metadata = KernelMetadata[]
40-
exceptions = Mem.alloc(device, sizeof(AMDGPU.Device.ExceptionEntry)*MAX_EXCEPTIONS; coherent=true)
40+
bytesize = sizeof(AMDGPU.Device.ExceptionEntry) * MAX_EXCEPTIONS
41+
exceptions = Mem.alloc(device, bytesize; coherent=true)
4142

4243
mod = ROCModule(exe, metadata, exceptions)
4344
EXE_TO_MODULE_MAP[exe] = WeakRef(mod)

src/runtime/memory.jl

+30-41
Original file line numberDiff line numberDiff line change
@@ -336,11 +336,7 @@ function alloc(device::ROCDevice, bytesize::Integer; coherent=false, slow_fallba
336336

337337
bytesize == 0 && return Buffer(C_NULL, C_NULL, C_NULL, 0, device, coherent, false)
338338

339-
region_kind = if coherent
340-
:finegrained
341-
else
342-
:coarsegrained
343-
end
339+
region_kind = coherent ? :finegrained : :coarsegrained
344340

345341
buf = nothing
346342
region = nothing
@@ -403,52 +399,45 @@ function alloc_or_retry!(f)
403399
end
404400

405401
const ALL_ALLOCS = Threads.Atomic{Int64}(0)
406-
function alloc(device::ROCDevice, pool::ROCMemoryPool, bytesize::Integer)
407-
if ALL_ALLOCS[] + bytesize > MEMORY_ALLOC_LIMIT
408-
check(HSA.STATUS_ERROR_OUT_OF_RESOURCES)
409-
end
402+
403+
_alloc(p::ROCMemoryPool, bytesize::Integer, ptr_ref) = HSA.amd_memory_pool_allocate(p.pool, bytesize, 0, ptr_ref)
404+
_alloc(p::ROCMemoryRegion, bytesize::Integer, ptr_ref) = HSA.memory_allocate(p.region, bytesize, ptr_ref)
405+
406+
_accessible(p::ROCMemoryRegion)::Bool = Runtime.region_host_accessible(p)
407+
_accessible(p::ROCMemoryPool)::Bool = Runtime.pool_accessible_by_all(p)
408+
409+
function alloc(
410+
device::ROCDevice, space::S, bytesize::Integer,
411+
) where S <: Union{ROCMemoryPool, ROCMemoryRegion}
410412
ptr_ref = Ref{Ptr{Cvoid}}()
411-
alloc_or_retry!() do
412-
HSA.amd_memory_pool_allocate(pool.pool, bytesize, 0, ptr_ref)
413-
end
414-
Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize))
415-
AMDGPU.hsaref!()
413+
alloc_or_retry!(() -> _alloc(space, bytesize, ptr_ref))
416414
ptr = ptr_ref[]
417-
return Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, Runtime.pool_accessible_by_all(pool), true)
418-
end
419-
function alloc(device::ROCDevice, region::ROCMemoryRegion, bytesize::Integer)
420-
if ALL_ALLOCS[] + bytesize > MEMORY_ALLOC_LIMIT
421-
check(HSA.STATUS_ERROR_OUT_OF_RESOURCES)
422-
end
423-
ptr_ref = Ref{Ptr{Cvoid}}()
424-
alloc_or_retry!() do
425-
HSA.memory_allocate(region.region, bytesize, ptr_ref)
426-
end
427-
Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize))
428415
AMDGPU.hsaref!()
429-
ptr = ptr_ref[]
430-
return Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, Runtime.region_host_accessible(region), false)
416+
Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize))
417+
Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, _accessible(space), S <: ROCMemoryPool)
431418
end
419+
432420
alloc(bytesize; kwargs...) = alloc(AMDGPU.device(), bytesize; kwargs...)
433421

434422
@static if AMDGPU.hip_configured
435-
function alloc_hip(bytesize::Integer)
436-
ptr_ref = Ref{Ptr{Cvoid}}()
437-
# FIXME: Set HIP device
438-
alloc_or_retry!() do
439-
try
440-
HIP.@check HIP.hipMalloc(ptr_ref, Csize_t(bytesize))
441-
HSA.STATUS_SUCCESS
442-
catch
443-
# FIXME: Actually check error code
444-
HSA.STATUS_ERROR_OUT_OF_RESOURCES
423+
function alloc_hip(bytesize::Integer)
424+
ptr_ref = Ref{Ptr{Cvoid}}()
425+
# FIXME: Set HIP device
426+
alloc_or_retry!() do
427+
try
428+
HIP.@check HIP.hipMalloc(ptr_ref, Csize_t(bytesize))
429+
HSA.STATUS_SUCCESS
430+
catch
431+
# FIXME: Actually check error code
432+
HSA.STATUS_ERROR_OUT_OF_RESOURCES
433+
end
445434
end
435+
AMDGPU.hsaref!()
436+
ptr = ptr_ref[]
437+
Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize))
438+
Buffer(ptr, C_NULL, ptr, Int64(bytesize), AMDGPU.device(), false, true)
446439
end
447-
AMDGPU.hsaref!()
448-
ptr = ptr_ref[]
449-
return Buffer(ptr, C_NULL, ptr, Int64(bytesize), AMDGPU.device(), false, true)
450440
end
451-
end # if AMDGPU.hip_configured
452441

453442
function free(buf::Buffer)
454443
buf.ptr == C_NULL && return

0 commit comments

Comments
 (0)