|
| 1 | +module AMDGPUEnzymeCoreExt |
| 2 | + |
| 3 | +using AMDGPU |
| 4 | +using EnzymeCore |
| 5 | +using EnzymeCore: EnzymeRules |
| 6 | +using GPUCompiler |
| 7 | + |
| 8 | +include("meta_kernels.jl") |
| 9 | + |
# Build the GPUCompiler job for the `ROCBackend`: look up the method instance for
# function type `F` with argument tuple type `TT`, then pair it with the compiler
# configuration of the device returned by `AMDGPU.device()`.
function EnzymeCore.compiler_job_from_backend(
    ::ROCBackend, @nospecialize(F::Type), @nospecialize(TT::Type),
)
    method_instance = GPUCompiler.methodinstance(F, TT)
    device_config = AMDGPU.compiler_config(AMDGPU.device())
    return GPUCompiler.CompilerJob(method_instance, device_config)
end
| 16 | + |
# Forward-mode rule for `AMDGPU.hipfunction` with a `Duplicated` return annotation.
# `hipfunction` is invoked once on the constant `f` / `tt` values; that single
# result serves as both the primal and the shadow.
function EnzymeRules.forward(
    config, fn::Const{typeof(AMDGPU.hipfunction)}, ::Type{<:Duplicated},
    f::Const{F}, tt::Const{TT}; kwargs...,
) where {F, TT}
    compiled = fn.val(f.val, tt.val; kwargs...)
    return Duplicated(compiled, compiled)
end
| 24 | + |
# Forward-mode rule for `AMDGPU.hipfunction` with a `BatchDuplicated{T, N}` return
# annotation. `hipfunction` is invoked once; the result is the primal and is also
# repeated `N` times to form the batch of shadows.
function EnzymeRules.forward(
    config, fn::Const{typeof(AMDGPU.hipfunction)}, ::Type{<:BatchDuplicated{T, N}},
    f::Const{F}, tt::Const{TT}; kwargs...,
) where {F, TT, T, N}
    compiled = fn.val(f.val, tt.val; kwargs...)
    shadows = ntuple(Val(N)) do _
        compiled
    end
    return BatchDuplicated(compiled, shadows)
end
| 32 | + |
# Reverse-mode rule for `AMDGPU.hipfunction`: nothing is propagated back, so one
# `nothing` is returned for each of the two arguments (`f` and `tt`).
function EnzymeRules.reverse(
    config, fn::Const{typeof(AMDGPU.hipfunction)}, ::Type{RT},
    subtape, f, tt; kwargs...,
) where {RT}
    no_gradient = nothing
    return (no_gradient, no_gradient)
end
| 39 | + |
# Forward-mode rule for `AMDGPU.rocconvert`.
#
# The conversion is applied independently to the primal (`x.val`) and to each
# shadow (`x.dval`, or `x.dval[i]` in batch mode). What is returned depends on
# what `config` requests:
#   * primal and shadow -> `Duplicated` (width 1) or `BatchDuplicated` (width > 1)
#   * shadow only       -> the converted shadow (width 1) or a tuple of them
#   * primal only       -> the converted primal
#   * neither           -> `nothing`
function EnzymeRules.forward(
    config, fn::Const{typeof(AMDGPU.rocconvert)}, ::Type{RT}, x::IT,
) where {RT, IT}
    if EnzymeRules.needs_primal(config) && EnzymeRules.needs_shadow(config)
        config_width = EnzymeRules.width(config)
        if config_width == 1
            Duplicated(fn.val(x.val), fn.val(x.dval))
        else
            tup = ntuple(Val(config_width)) do i
                Base.@_inline_meta
                fn.val(x.dval[i])::eltype(RT)
            end
            BatchDuplicated(fn.val(x.val), tup)
        end

    elseif EnzymeRules.needs_shadow(config)
        config_width = EnzymeRules.width(config)
        # Use the `EnzymeRules` accessor, matching the `augmented_primal` rules in
        # this file, rather than reaching for it through the parent module.
        ST = EnzymeRules.shadow_type(config, RT)
        if config_width == 1
            fn.val(x.dval)::ST
        else
            (ntuple(Val(config_width)) do i
                Base.@_inline_meta
                fn.val(x.dval[i])::eltype(RT)
            end)::ST
        end

    elseif EnzymeRules.needs_primal(config)
        fn.val(x.val)::eltype(RT)
    else
        nothing
    end
end
| 73 | + |
# Augmented-primal (forward pass of split reverse mode) rule for
# `AMDGPU.rocconvert`. Converts the primal and/or shadow(s) as requested by
# `config` and records no tape (`nothing`).
function EnzymeRules.augmented_primal(
    config, fn::Const{typeof(AMDGPU.rocconvert)}, ::Type{RT}, x::IT,
) where {RT, IT}
    primal = if EnzymeRules.needs_primal(config)
        fn.val(x.val)
    else
        nothing
    end

    shadow = if !EnzymeRules.needs_shadow(config)
        nothing
    elseif EnzymeRules.width(config) == 1
        fn.val(x.dval)
    else
        # Batch mode: convert every shadow in the batch.
        ntuple(Val(EnzymeRules.width(config))) do i
            Base.@_inline_meta
            fn.val(x.dval[i])
        end
    end

    PrimalT = EnzymeRules.primal_type(config, RT)
    ShadowT = EnzymeRules.shadow_type(config, RT)
    return EnzymeRules.AugmentedReturn{PrimalT, ShadowT, Nothing}(primal, shadow, nothing)
end
| 99 | + |
# Reverse-mode rule for `AMDGPU.rocconvert`: nothing is propagated back to the
# single argument `x`, so a one-element tuple holding `nothing` is returned.
function EnzymeRules.reverse(
    config, fn::Const{typeof(AMDGPU.rocconvert)}, ::Type{RT}, tape, x::IT,
) where {RT, IT}
    no_gradient = nothing
    return (no_gradient,)
end
| 105 | + |
# Forward-mode rule for launching a `HIPKernel`.
#
# Each argument is passed through `rocconvert`, a kernel is compiled from
# `meta_fn` for the signature `(config, F, converted args...)`, and that kernel is
# launched with the original kernel function plus the forwarded keyword
# arguments. `meta_fn` is not defined in this file; presumably it comes from the
# included `meta_kernels.jl`.
function EnzymeRules.forward(
    config, fn::EnzymeCore.Annotation{AMDGPU.Runtime.HIPKernel{F, TT}},
    ::Type{Const{Nothing}}, args...; kwargs...,
) where {F, TT}
    # Keep the host-side arguments rooted while their converted forms are in use.
    GC.@preserve args begin
        converted = map(rocconvert, args)
        signature = Tuple{typeof(config), F, map(typeof, converted)...}
        launcher = AMDGPU.hipfunction(meta_fn, signature)
        launcher(config, fn.val.f, converted...; kwargs...)
    end
    return
end
| 118 | + |
# Reverse-mode rule for launching a `HIPKernel`.
#
# Each argument is passed through `rocconvert`, a kernel is compiled from
# `meta_revf` for the signature `(config, F, converted subtape, converted args...)`,
# and it is launched with the requested `groupsize` / `gridsize` (normalised to
# `ROCDim3`) plus any remaining keyword arguments. `meta_revf` is not defined in
# this file; presumably it comes from the included `meta_kernels.jl`.
#
# Returns one `nothing` per kernel argument.
#
# The tape type is not recomputed here: the tape arrives as `subtape`, and only
# its converted form's type is needed for the kernel signature.
function EnzymeRules.reverse(
    config, ofn::EnzymeCore.Annotation{AMDGPU.Runtime.HIPKernel{F, TT}},
    ::Type{Const{Nothing}}, subtape, args...;
    groupsize::AMDGPU.Runtime.ROCDim = 1,
    gridsize::AMDGPU.Runtime.ROCDim = 1,
    kwargs...,
) where {F, TT}
    kernel_args = ((rocconvert(a) for a in args)...,)
    kernel_tt = map(typeof, kernel_args)

    groupsize = AMDGPU.Runtime.ROCDim3(groupsize)
    gridsize = AMDGPU.Runtime.ROCDim3(gridsize)

    # Keep the host-side arguments and tape rooted while their converted forms
    # are in use.
    GC.@preserve args subtape begin
        subtape_cc = rocconvert(subtape)
        kernel_tt2 = Tuple{
            (typeof(config), F, typeof(subtape_cc), kernel_tt...)...}
        kernel = AMDGPU.hipfunction(meta_revf, kernel_tt2)
        kernel(config, ofn.val.f, subtape_cc, kernel_args...;
            groupsize, gridsize, kwargs...)
    end

    return ntuple(Val(length(kernel_args))) do i
        Base.@_inline_meta
        nothing
    end
end
| 155 | + |
# Augmented-primal rule for `AMDGPU.hipfunction`.
#
# `hipfunction` is invoked once on the constant `f` / `tt` values. The result is
# reused as the primal (if requested) and as the shadow (if requested; repeated
# `width` times in batch mode). No tape is recorded.
function EnzymeRules.augmented_primal(
    config, fn::Const{typeof(AMDGPU.hipfunction)},
    ::Type{RT}, f::Const{F}, tt::Const{TT}; kwargs...
) where {F, CT, RT <: EnzymeCore.Annotation{CT}, TT}
    compiled = fn.val(f.val, tt.val; kwargs...)

    primal = if EnzymeRules.needs_primal(config)
        compiled
    else
        nothing
    end

    shadow = if !EnzymeRules.needs_shadow(config)
        nothing
    elseif EnzymeRules.width(config) == 1
        compiled
    else
        ntuple(Val(EnzymeRules.width(config))) do i
            Base.@_inline_meta
            compiled
        end
    end

    PrimalT = EnzymeRules.primal_type(config, RT)
    ShadowT = EnzymeRules.shadow_type(config, RT)
    return EnzymeRules.AugmentedReturn{PrimalT, ShadowT, Nothing}(primal, shadow, nothing)
end
| 180 | + |
# Augmented-primal rule for launching a `HIPKernel`.
#
# Steps:
#   1. Pass every argument through `rocconvert` and collect the converted types.
#   2. Ask EnzymeCore for the tape type of the split reverse pass, using a
#      compiler job obtained for the `ROCBackend`.
#   3. Allocate a `ROCArray` of that tape type with one element per
#      (grid x group) index, i.e. the product of all six launch dimensions.
#   4. Compile a kernel from `meta_augf` and launch it with the tape and arguments.
#
# The tape array is returned inside the `AugmentedReturn`. `meta_augf` is not
# defined in this file; presumably it comes from the included `meta_kernels.jl`.
function EnzymeRules.augmented_primal(
    config, fn::EnzymeCore.Annotation{AMDGPU.Runtime.HIPKernel{F,TT}},
    ::Type{Const{Nothing}}, args...;
    groupsize::AMDGPU.Runtime.ROCDim = 1,
    gridsize::AMDGPU.Runtime.ROCDim = 1, kwargs...,
) where {F,TT}
    converted = map(rocconvert, args)
    converted_types = map(typeof, converted)

    overwritten = EnzymeRules.overwritten(config)
    # The job is built for `identity(::Float64)`; it looks like only the backend
    # target matters for `tape_type` here. NOTE(review): confirm.
    job = EnzymeCore.compiler_job_from_backend(
        ROCBackend(), typeof(Base.identity), Tuple{Float64})
    mode = ReverseSplitModified(
        EnzymeCore.set_runtime_activity(ReverseSplitWithPrimal, config),
        Val(overwritten))
    TapeT = EnzymeCore.tape_type(
        job, mode, Const{F}, Const{Nothing}, converted_types...)

    group3 = AMDGPU.Runtime.ROCDim3(groupsize)
    grid3 = AMDGPU.Runtime.ROCDim3(gridsize)
    tape_length = grid3.x * grid3.y * grid3.z * group3.x * group3.y * group3.z
    subtape = ROCArray{TapeT}(undef, tape_length)

    # Keep the host-side arguments and tape rooted while their converted forms
    # are in use.
    GC.@preserve args subtape begin
        device_tape = rocconvert(subtape)
        signature = Tuple{
            typeof(config), F, typeof(device_tape), converted_types...}
        launcher = AMDGPU.hipfunction(meta_augf, signature)
        launcher(config, fn.val.f, device_tape, converted...;
            groupsize = group3, gridsize = grid3, kwargs...)
    end

    return EnzymeRules.AugmentedReturn{Nothing, Nothing, ROCArray}(
        nothing, nothing, subtape)
end
| 217 | + |
| 218 | +end |
0 commit comments