Skip to content

Commit 9e1041b

Browse files
authored
Merge pull request #254 from JuliaGPU/jps/export-indexing-intrinsics
Re-export indexing intrinsics
2 parents 1e76728 + 94a3854 commit 9e1041b

13 files changed

+42
-44
lines changed

docs/src/api.md

+7-7
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,16 @@ AMDGPU.rocfunction
1616
#### HSA nomenclature
1717

1818
```@docs
19-
AMDGPU.Device.workitemIdx
20-
AMDGPU.Device.workgroupIdx
21-
AMDGPU.Device.workgroupDim
22-
AMDGPU.Device.gridDim
23-
AMDGPU.Device.gridDimWG
19+
AMDGPU.workitemIdx
20+
AMDGPU.workgroupIdx
21+
AMDGPU.workgroupDim
22+
AMDGPU.gridItemDim
23+
AMDGPU.gridGroupDim
2424
```
2525

2626
#### CUDA nomenclature
2727

28-
Use these functions for compatibility with CUDAnative.jl.
28+
Use these functions for compatibility with CUDA.jl.
2929

3030
```@docs
3131
AMDGPU.Device.threadIdx
@@ -36,7 +36,7 @@ AMDGPU.Device.blockDim
3636
### Synchronization
3737

3838
```@docs
39-
AMDGPU.Device.sync_workgroup
39+
AMDGPU.sync_workgroup
4040
```
4141

4242
### Global Variables

docs/src/quickstart.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -116,13 +116,13 @@ wait(@roc groupsize=N vadd!(c_d, a_d, b_d))
116116
| [`workitemIdx`](@ref) | [`threadIdx`](@ref) |
117117
| [`workgroupIdx`](@ref) | [`blockIdx`](@ref) |
118118
| [`workgroupDim`](@ref) | [`blockDim`](@ref) |
119-
| [`gridDim`](@ref) | No equivalent |
120-
| [`gridDimWG`](@ref) | `gridDim` |
119+
| [`gridItemDim`](@ref) | No equivalent |
120+
| [`gridGroupDim`](@ref) | `gridDim` |
121121
| `groupsize` | `threads` |
122122
| `gridsize` | `blocks * threads` |
123123
| `queue` | `stream` |
124124

125-
For compatibility reasons, the symbols in the CUDAnative column (except for `gridDim`) are also supported by AMDGPU.
125+
For compatibility reasons, the symbols in the CUDA column (except for `gridDim`) are also supported by AMDGPU.
126126

127127
Finally, we can make sure that the results match, by first copying the data to the host and then comparing it with the CPU results:
128128

src/AMDGPU.jl

+4-1
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,15 @@ module Device
9797
end
9898
import .Device: malloc, signal_exception, report_exception, report_oom, report_exception_frame
9999
import .Device: ROCDeviceArray, AS, HostCall, hostcall!
100-
import .Device: workitemIdx, workgroupIdx, workgroupDim, gridDim, gridDimWG
100+
import .Device: workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
101101
import .Device: threadIdx, blockIdx, blockDim
102+
import .Device: sync_workgroup
102103
import .Device: @rocprint, @rocprintln, @rocprintf
103104

104105
export ROCDeviceArray
105106
export @rocprint, @rocprintln, @rocprintf
107+
export workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
108+
export sync_workgroup
106109

107110
module Compiler
108111
using ..GPUCompiler

src/array.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ for (f, froc) in (
2525
(:blockidx, :blockIdx),
2626
(:blockdim, :blockDim),
2727
(:threadidx, :threadIdx),
28-
(:griddim, :gridDimWG)
28+
(:griddim, :gridGroupDim)
2929
)
3030
@eval @inline GPUArrays.$f(::ROCKernelContext) = AMDGPU.$froc().x
3131
end
@@ -47,7 +47,7 @@ end
4747
# synchronization
4848

4949
@inline function GPUArrays.synchronize_threads(::ROCKernelContext)
50-
AMDGPU.sync_workgroup()
50+
sync_workgroup()
5151
return
5252
end
5353

src/deprecations.jl

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
@deprecate gridDim() gridItemDim()
2+
@deprecate gridDimWG() gridGroupDim()
13
@deprecate HSAAgent ROCDevice
24
@deprecate HSAQueue ROCQueue
35
@deprecate HSASignal ROCSignal

src/device/gcn/indexing.jl

+6-8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
# Indexing and dimensions
2-
export workitemIdx, workgroupIdx, workgroupDim, gridDim, gridDimWG
3-
export threadIdx, blockIdx, blockDim
42

53
@generated function _index(::Val{fname}, ::Val{name}, ::Val{range}) where {fname, name, range}
64
Context() do ctx
@@ -106,11 +104,11 @@ for (dim,off) in ((:x,1), (:y,2), (:z,3))
106104
@eval @inline $cufn() = $fn()
107105

108106
# Grid dimension (in workitems)
109-
fn = Symbol("gridDim_$dim")
107+
fn = Symbol("gridItemDim_$dim")
110108
base = _packet_offsets[findfirst(x->x==:grid_size_x,_packet_names)]
111109
@eval @inline $fn() = Int(_dim($(Val(base)), $(Val(off)), $(Val(0:(_max_grid_size[dim]-1))), UInt32))
112110
# Grid dimension (in workgroups)
113-
fn_wg = Symbol("gridDimWG_$dim")
111+
fn_wg = Symbol("gridGroupDim_$dim")
114112
fn_wg_dim = Symbol("workgroupDim_$dim")
115113
@eval @inline $fn_wg() = div($fn(), $fn_wg_dim())
116114
end
@@ -140,20 +138,20 @@ See also: [`blockDim`](@ref)
140138
@inline workgroupDim() = (x=workgroupDim_x(), y=workgroupDim_y(), z=workgroupDim_z())
141139

142140
"""
143-
gridDim()::ROCDim3
141+
gridItemDim()::ROCDim3
144142
145143
Returns the size of the grid in workitems.
146144
This behaviour is different from CUDA where `gridDim` gives the size of the grid in blocks.
147145
"""
148-
@inline gridDim() = (x=gridDim_x(), y=gridDim_y(), z=gridDim_z())
146+
@inline gridItemDim() = (x=gridItemDim_x(), y=gridItemDim_y(), z=gridItemDim_z())
149147

150148
"""
151-
gridDimWG()::ROCDim3
149+
gridGroupDim()::ROCDim3
152150
153151
Returns the size of the grid in workgroups.
154152
This is equivalent to CUDA's `gridDim`.
155153
"""
156-
@inline gridDimWG() = (x=gridDimWG_x(), y=gridDimWG_y(), z=gridDimWG_z())
154+
@inline gridGroupDim() = (x=gridGroupDim_x(), y=gridGroupDim_y(), z=gridGroupDim_z())
157155

158156
# For compat with CUDAnative et al.
159157

src/device/gcn/synchronization.jl

-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
export sync_workgroup
2-
31
"""
42
sync_workgroup()
53

src/mapreduce.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# perform a reduction
1717
d = items>>1
1818
while d > 0
19-
Device.sync_workgroup()
19+
sync_workgroup()
2020
if item <= d
2121
shared[item] = op(shared[item], shared[item+d])
2222
end
@@ -44,7 +44,7 @@ function partial_mapreduce_device(f, op, neutral, maxitems, Rreduce, Rother, R,
4444
localIdx_reduce = workitemIdx().x
4545
localDim_reduce = workgroupDim().x
4646
groupIdx_reduce, groupIdx_other = fldmod1(workgroupIdx().x, length(Rother))
47-
groupDim_reduce = gridDimWG().x ÷ length(Rother)
47+
groupDim_reduce = gridGroupDim().x ÷ length(Rother)
4848

4949
# group-based indexing into the values outside of the reduction dimension
5050
# (that means we can safely synchronize items within this group)

test/codegen/synchronization.jl

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
@testset "Synchronization" begin
2-
# TODO: Remove dummy argument
3-
function synckern(x)
4-
Device.sync_workgroup()
2+
function synckern()
3+
sync_workgroup()
54
nothing
65
end
76

87
iob = IOBuffer()
9-
AMDGPU.code_gcn(iob, synckern, Tuple{Int}; kernel=true)
8+
AMDGPU.code_gcn(iob, synckern, Tuple{}; kernel=true)
109
@test occursin("s_barrier", String(take!(iob)))
1110
end

test/device/indexing.jl

+7-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import .Device: workitemIdx, workgroupIdx, workgroupDim, gridDim, gridDimWG
2-
31
@testset "Kernel Indexing" begin
42

53
function idx_kern(X)
@@ -16,7 +14,7 @@ end
1614

1715
A = zeros(Int64, 6)
1816
RA = ROCArray(A)
19-
@roc groupsize=(1,2,3) gridsize=(4,5,6) idx_kern(RA)
17+
wait(@roc groupsize=(1,2,3) gridsize=(4,5,6) idx_kern(RA))
2018
A = Array(RA)
2119
@test all(A .> 0)
2220

@@ -25,13 +23,13 @@ function dim_kern(X)
2523
X[2] = workgroupDim().y
2624
X[3] = workgroupDim().z
2725

28-
X[4] = gridDim().x
29-
X[5] = gridDim().y
30-
X[6] = gridDim().z
26+
X[4] = gridItemDim().x
27+
X[5] = gridItemDim().y
28+
X[6] = gridItemDim().z
3129

32-
X[7] = gridDimWG().x
33-
X[8] = gridDimWG().y
34-
X[9] = gridDimWG().z
30+
X[7] = gridGroupDim().x
31+
X[8] = gridGroupDim().y
32+
X[9] = gridGroupDim().z
3533

3634
nothing
3735
end

test/device/output.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ end
123123
if idx == i
124124
@rocprintf "[%d] " idx
125125
end
126-
Device.sync_workgroup()
126+
sync_workgroup()
127127
end
128128
nothing
129129
end

test/device/vadd.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# The original test :)
22
@testset "Vector Addition Kernel" begin
33
function vadd(a,b,c)
4-
i = Device.workitemIdx().x
4+
i = workitemIdx().x
55
c[i] = a[i] + b[i]
6-
Device.sync_workgroup()
6+
sync_workgroup()
77
return nothing
88
end
99

test/device/wavefront.jl

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
@testset "Wavefront Operations" begin
22
function reduce_kernel(op,X,Y)
3-
idx = AMDGPU.workitemIdx().x
3+
idx = workitemIdx().x
44
Y[1] = AMDGPU.Device.wfred(op,X[idx])
55
nothing
66
end
77
function scan_kernel(op,X,Y)
8-
idx = AMDGPU.workitemIdx().x
8+
idx = workitemIdx().x
99
Y[idx] = AMDGPU.Device.wfscan(op,X[idx],true)
1010
nothing
1111
end
1212
function bool_kernel(X,Y)
13-
idx = AMDGPU.workitemIdx().x
13+
idx = workitemIdx().x
1414
Y[1] = AMDGPU.Device.wfany(X[idx])
1515
Y[2] = AMDGPU.Device.wfall(X[idx])
1616
Y[3] = AMDGPU.Device.wfsame(X[idx])

0 commit comments

Comments
 (0)