Similar to CUDA streams, ROCm has the concept of queues, which are
buffers used to instruct the GPU hardware which kernels to launch. ROCm queues
are asynchronous, unlike CUDA streams. Each device has a default queue
associated, which is accessible with `default_queue(device)` (or
`default_queue()` for the default device's default queue). You can specify
which queue to launch a kernel on with the `queue` argument to `@roc`:

```julia
q = AMDGPU.ROCQueue(device)
@roc queue=q kernel(...)
```

[...] which can be inspected to determine how many (and which) kernels are
executing by comparing the signals returned from `@roc`. You can also omit the
`queue` argument, which will then check the default queue.

Sometimes a kernel gets "stuck" and locks up the GPU (noticeable with 100%
GPU usage in `rocm-smi`); you can kill the kernel and all other kernels in the
queue with `kill_queue!(queue)`. This can be "safely" done to the default
queue, since default queues are recreated as-needed.
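
For instance, a minimal sketch of recovering from a stuck kernel on the default
queue (assuming `kill_queue!` and `default_queue` are reachable from the
`AMDGPU` module):

```julia
# Kill the stuck kernel (and anything else queued) on the default queue;
# default queues are recreated as-needed, so this is relatively safe
AMDGPU.kill_queue!(AMDGPU.default_queue())
```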

Queues also have an inherent priority, which allows control of kernel
submission latency and on-device scheduling preference with respect to kernels
submitted on other queues. There are three priorities: normal (the default),
low, and high. These can be set at queue creation time:

```julia
low_prio_queue = ROCQueue(device; priority=:low)
high_prio_queue = ROCQueue(device; priority=:high)
normal_prio_queue = ROCQueue(device; priority=:normal) # or just omit "priority"
```

# Signals

Unlike CUDA, ROCm kernels are tracked by an associated signal, which is
created and returned by `@roc`, and is `wait`ed on to track kernel completion.
Signals may also be used for manual synchronization (since they work for CPUs
and GPUs equally well). CPU usage is done with the `HSA.signal_*` functions,
and GPU usage is done with the `device_signal_*` and `hostcall_device_signal_*`
functions. For most signalling needs, consider using a hostcall instead.
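
For instance, a rough sketch of host-side synchronization (`my_kernel` and
`args` here are placeholders, not part of the API):

```julia
# `@roc` launches asynchronously and returns an object carrying the kernel's signal
sig = @roc my_kernel(args...)

# Block the host until the kernel signals completion
wait(sig)
```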

If custom signal handling is desired, signals can be manually constructed and
passed to `@roc`:

```julia
# A kernel which waits on all signals in `sigs`
function multi_wait(sigs)
    for i in 1:length(sigs)
        AMDGPU.Device.hostcall_device_signal_wait(sigs[i], 0)
    end
    nothing
end