Transition to HIP in AMDGPU (#10)
Use HIP for the AMDGPU extension and other minor fixes.
I am merging this even though Enzyme is not yet working with AMDGPU; the issue is unrelated to TinyKernels (see JuliaGPU/AMDGPU.jl#444).
luraess authored Jul 16, 2023
2 parents 3879c88 + 08d8e41 commit 38aaec3
Showing 8 changed files with 61 additions and 58 deletions.
26 changes: 16 additions & 10 deletions Project.toml
@@ -1,26 +1,32 @@
name = "TinyKernels"
uuid = "f7cbc414-f748-44bf-86e6-e44e9a55e39d"
authors = ["Ivan Utkin <[email protected]> and Ludovic Raess <[email protected]>"]
version = "0.3.2"
version = "0.4.0"

[deps]
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"

[compat]
AMDGPU = "0.5"
CUDA = "4"
Enzyme = "0.11"
Metal = "0.5"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[extensions]
CUDAExt = "CUDA"
AMDGPUExt = "AMDGPU"
MetalExt = "Metal"
CUDAExt = "CUDA"
EnzymeExt = "Enzyme"
MetalExt = "Metal"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
9 changes: 2 additions & 7 deletions README.md
@@ -6,20 +6,15 @@

TinyKernels.jl is mostly a heavily stripped-down version of [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) supporting a bare minimum of features. This package provides a sandbox for Julia GPU tooling and a way to measure kernel performance in a GPU-agnostic fashion. While the API of KernelAbstractions.jl is in a "transient" state, this package will provide a thin abstraction layer on top of the [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl), [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) and [Metal.jl](https://github.com/JuliaGPU/Metal.jl) packages.

TinyKernels.jl allows explicitly launching GPU kernels asynchronously on different streams or queues with a given priority. This feature facilitates overlapping computations with memory transfers in distributed configurations.
TinyKernels.jl allows explicitly launching GPU kernels asynchronously on different streams with a given priority. This feature facilitates overlapping computations with memory transfers in distributed configurations.
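As an illustration of this launch model, here is a minimal sketch assuming the `@tiny`/`device_array` API exported by the package and the kernel-instantiation pattern used in its examples; the `memcopy!` kernel, the array sizes, and the exact index-macro usage are illustrative and may differ in detail:

```julia
using TinyKernels
using CUDA  # loading CUDA.jl (or AMDGPU.jl / Metal.jl) activates the matching extension

# Kernel definition; @indices is assumed to yield the per-work-item indices.
@tiny function memcopy!(A, B)
    ix, iy = @indices
    @inbounds A[ix, iy] = B[ix, iy]
    return
end

device = CUDADevice()
A = device_array(Float64, device, 1024, 1024)
B = device_array(Float64, device, 1024, 1024)

memcopy_d! = memcopy!(device)  # assumed: @tiny generates this device-binding method

# Asynchronous launch on a high-priority stream; an event is returned immediately.
ev = memcopy_d!(A, B; ndrange=size(A), priority=:high)
wait(ev)  # block the host until this particular kernel has completed
```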

TinyKernels.jl supports automatic differentiation with [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) by overloading the `Enzyme.autodiff` function to enable reverse-mode AD of GPU (and CPU) kernels.
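A hedged sketch of how this hook might be used, based on the `EnzymeExt` code in this commit: `Enzyme.autodiff(kernel)` returns a new kernel whose arguments are forwarded to `Enzyme.autodiff_deferred`, so they are passed as Enzyme activity annotations. The `square!` kernel and the shadow arrays below are illustrative:

```julia
using TinyKernels, Enzyme
using CUDA

@tiny function square!(B, A)
    ix, iy = @indices
    @inbounds B[ix, iy] = A[ix, iy]^2
    return
end

device = CUDADevice()
A, Ā = device_array(Float64, device, 64, 64), device_array(Float64, device, 64, 64)
B, B̄ = device_array(Float64, device, 64, 64), device_array(Float64, device, 64, 64)
# Ā should start at zero (gradients accumulate); B̄ holds the incoming adjoint seed.

square_d!  = square!(device)
∇square_d! = Enzyme.autodiff(square_d!)  # kernel that runs the reverse pass

wait(∇square_d!(Duplicated(B, B̄), Duplicated(A, Ā); ndrange=size(A)))
# Ā now accumulates 2 .* A .* B̄
```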

Preliminary benchmarks can be found in [TinyBenchmarks.jl](https://github.com/luraess/TinyBenchmarks.jl), and a Metal playground in [MetalGPU](https://github.com/luraess/MetalGPU).

Stay tuned :rocket:

### Compat
- AMDGPU ≥ v0.4.8
- CUDA ≥ 3.13
- Metal ≥ v0.3.0
### Notes

⚠️ **Metal backend:**
- Only `Float32` is supported. For `Float64`, one could try using a construct from [DoubleFloats.jl](https://github.com/JuliaMath/DoubleFloats.jl/blob/ef689ccbab37d84943e2533309d34c6665229cab/src/Double.jl#L30), _which may impact performance_.
- Automatic differentiation (AD) capabilities (Enzyme.jl) currently do not work on the ARM GPU (Metal) and give erroneous results on the ARM CPU.
- Only `Float32` is supported. For `Float64`, one could try using a construct from [DoubleFloats.jl](https://github.com/JuliaMath/DoubleFloats.jl/blob/ef689ccbab37d84943e2533309d34c6665229cab/src/Double.jl#L30), _which may impact performance_.
2 changes: 1 addition & 1 deletion examples/example_2d.jl
@@ -51,4 +51,4 @@ function main(::Type{DAT}; device) where DAT
end

println("running on $backend device...")
main(eletype; device)
main(eletype; device)
60 changes: 28 additions & 32 deletions ext/AMDGPUExt.jl
@@ -9,70 +9,66 @@ else
end

import TinyKernels: AMDGPUDevice, AbstractEvent, Kernel
import TinyKernels: device_array, device_synchronize, __get_index, ndrange_to_indices
import TinyKernels: device_array, device_synchronize, synchronize, __get_index, ndrange_to_indices

import Base: wait

struct AMDGPUEvent <: AbstractEvent
signal::AMDGPU.ROCSignal
queue::AMDGPU.ROCQueue
event::AMDGPU.HIP.HIPEvent
end

wait(ev::AMDGPUEvent) = wait(ev.signal; queue=ev.queue)
wait(ev::AMDGPUEvent) = AMDGPU.HIP.synchronize(ev.event)
wait(evs::AbstractArray{AMDGPUEvent}) = wait.(evs)

mutable struct QueuePool
next_queue_idx::Int
queues::Vector{AMDGPU.ROCQueue}
mutable struct StreamPool
next_stream_idx::Int
streams::Vector{AMDGPU.HIPStream}
end

const MAX_QUEUES = 2
const QUEUES = Dict{Symbol,QueuePool}()
const MAX_STREAMS = 6
const STREAMS = Dict{Symbol,StreamPool}()

function get_queue(priority::Symbol)
pool = get!(QUEUES, priority) do
max_queues = MAX_QUEUES
function get_stream(priority::Symbol)
pool = get!(STREAMS, priority) do
max_streams = MAX_STREAMS
roc_priority = if priority == :high
:high
elseif priority == :low
:low
else
error("unknown priority $priority")
end
QueuePool(1, [AMDGPU.ROCQueue(AMDGPU.default_device(); priority=roc_priority) for _ in 1:max_queues])
StreamPool(0, [AMDGPU.HIPStream(roc_priority) for _ in 1:max_streams])
end
return pick_queue(pool)
return pick_stream(pool)
end

function pick_queue(pool::QueuePool)
# round-robin queue selection
pool.next_queue_idx += 1
pool.next_queue_idx = ((pool.next_queue_idx - 1) % length(pool.queues)) + 1
return pool.queues[pool.next_queue_idx]
function pick_stream(pool::StreamPool)
# round-robin stream selection
pool.next_stream_idx += 1
pool.next_stream_idx = ((pool.next_stream_idx - 1) % length(pool.streams)) + 1
return pool.streams[pool.next_stream_idx]
end

function (k::Kernel{<:AMDGPUDevice})(args...; ndrange, priority=:low, nthreads=nothing)
ndrange = ndrange_to_indices(ndrange)
if isnothing(nthreads)
nthreads = min(length(ndrange), 256)
end
ngrid = length(ndrange)
# create signal
sig = AMDGPU.ROCSignal()
# launch kernel
queue = get_queue(priority)
AMDGPU.HSA.signal_store_screlease(sig.signal, 1)
AMDGPU.@roc wait=false mark=false signal=sig groupsize=nthreads gridsize=ngrid queue=queue k.fun(ndrange, args...)
return AMDGPUEvent(sig, queue)
nblocks = cld(length(ndrange), nthreads)
stream = get_stream(priority)
event = AMDGPU.HIP.HIPEvent(stream)
AMDGPU.@roc groupsize=nthreads gridsize=nblocks stream=stream k.fun(ndrange, args...)
AMDGPU.HIP.record(event)
return AMDGPUEvent(event)
end

device_array(::Type{T}, ::AMDGPUDevice, dims...) where T = AMDGPU.ROCArray{T}(undef, dims)

function device_synchronize(::AMDGPUDevice)
wait(AMDGPU.barrier_and!(AMDGPU.default_queue(), AMDGPU.active_kernels(AMDGPU.default_queue())))
return
end
device_synchronize(::AMDGPUDevice) = AMDGPU.HIP.device_synchronize()

synchronize(::AMDGPUDevice) = AMDGPU.synchronize()

@device_override @inline __get_index() = (AMDGPU.workgroupIdx().x-1)*AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x

end
end # module
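The `StreamPool` above hands out HIP streams round-robin, so consecutive launches at the same priority land on different streams. A small, GPU-free sketch of that selection logic (the names mirror the extension but are illustrative):

```julia
# GPU-free sketch of the round-robin stream pool used by the extensions.
mutable struct Pool{T}
    next_idx::Int
    items::Vector{T}
end

Pool(items::Vector{T}) where {T} = Pool{T}(0, items)

function pick!(pool::Pool)
    # advance the counter, then wrap it back into 1:length(items)
    pool.next_idx += 1
    pool.next_idx = (pool.next_idx - 1) % length(pool.items) + 1
    return pool.items[pool.next_idx]
end

pool = Pool(["stream-1", "stream-2", "stream-3"])
@assert pick!(pool) == "stream-1"  # counter starts at 0, so the first pick is stream-1
@assert pick!(pool) == "stream-2"
@assert pick!(pool) == "stream-3"
@assert pick!(pool) == "stream-1"  # wraps around
```

Initialising the counter to 0 (as the CUDA and Metal pools now also do) makes the first launch pick the first stream in the pool rather than the second.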
8 changes: 5 additions & 3 deletions ext/CUDAExt.jl
@@ -9,7 +9,7 @@ else
end

import TinyKernels: CUDADevice, AbstractEvent, Kernel
import TinyKernels: device_array, device_synchronize, __get_index, ndrange_to_indices
import TinyKernels: device_array, device_synchronize, synchronize, __get_index, ndrange_to_indices

import Base: wait

@@ -39,7 +39,7 @@ function get_stream(priority::Symbol)
else
error("unknown priority $priority")
end
StreamPool(1, [CUDA.CuStream(; priority=cu_priority) for _ in 1:max_streams])
StreamPool(0, [CUDA.CuStream(; priority=cu_priority) for _ in 1:max_streams])
end
return pick_stream(pool)
end
@@ -69,7 +69,9 @@ end

device_array(::Type{T}, ::CUDADevice, dims...) where T = CUDA.CuArray{T}(undef, dims)

device_synchronize(::CUDADevice) = CUDA.synchronize()
device_synchronize(::CUDADevice) = CUDA.device_synchronize()

synchronize(::CUDADevice) = CUDA.synchronize()

@device_override @inline __get_index() = (CUDA.blockIdx().x-1)*CUDA.blockDim().x + CUDA.threadIdx().x

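With this change the package distinguishes two synchronization levels: `device_synchronize` now maps to `CUDA.device_synchronize()` (a device-wide barrier), while the new `synchronize` maps to `CUDA.synchronize()` (waits for the current stream/task). A hedged usage sketch under the same API assumptions as the README example above; the `fill_two!` kernel is illustrative:

```julia
using TinyKernels
using CUDA

@tiny function fill_two!(A)
    ix, iy = @indices
    @inbounds A[ix, iy] = 2.0
    return
end

device = CUDADevice()
A = device_array(Float64, device, 256, 256)
fill_two_d! = fill_two!(device)

ev = fill_two_d!(A; ndrange=size(A), priority=:high)
wait(ev)                    # wait for this one launch only

synchronize(device)         # wait for work on the current stream/task
device_synchronize(device)  # device-wide barrier: everything in flight, all streams
```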
2 changes: 1 addition & 1 deletion ext/EnzymeExt.jl
@@ -7,7 +7,7 @@ import TinyKernels: AbstractGPUDevice, CPUDevice, Kernel
function Enzyme.autodiff(kernel::Kernel{<:AbstractGPUDevice, Fun}) where Fun
fun = kernel.fun
function df(ctx, args...)
Enzyme.autodiff_deferred(fun::Fun, Enzyme.Const, ctx, args...)
Enzyme.autodiff_deferred(Enzyme.Reverse, fun::Fun, Enzyme.Const, ctx, args...)
return
end
similar(kernel, df)
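Passing `Enzyme.Reverse` as the first argument follows the calling convention of Enzyme 0.11, where the differentiation mode is given explicitly. A small CPU-only illustration of that convention, independent of TinyKernels (the function `f` is made up):

```julia
using Enzyme

f(x) = x^2 + 3x

# Enzyme >= 0.11: mode first, then the function, the return activity,
# and the annotated arguments.
grads = Enzyme.autodiff(Enzyme.Reverse, f, Enzyme.Active, Enzyme.Active(2.0))
# grads[1][1] == 7.0, i.e. d/dx (x^2 + 3x) evaluated at x = 2
```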
8 changes: 5 additions & 3 deletions ext/MetalExt.jl
@@ -9,7 +9,7 @@ else
end

import TinyKernels: MetalDevice, AbstractEvent, Kernel
import TinyKernels: device_array, device_synchronize, __get_index, ndrange_to_indices
import TinyKernels: device_array, device_synchronize, synchronize, __get_index, ndrange_to_indices

import Base: wait

@@ -32,7 +32,7 @@ function get_queue(priority::Symbol) # no priority selection yet
pool = get!(QUEUES, priority) do
max_queues = MAX_QUEUES
dev = Metal.current_device()
QueuePool(1, [Metal.MTLCommandQueue(dev) for _ in 1:max_queues])
QueuePool(0, [Metal.MTLCommandQueue(dev) for _ in 1:max_queues])
end
return pick_queue(pool)
end
@@ -58,7 +58,9 @@ end

device_array(::Type{T}, ::MetalDevice, dims...) where {T} = Metal.MtlArray{T}(undef, dims)

device_synchronize(::MetalDevice) = Metal.synchronize() # device_synchronize() forces device sync
device_synchronize(::MetalDevice) = Metal.device_synchronize() # device_synchronize() forces device sync

synchronize(::MetalDevice) = Metal.synchronize()

@device_override @inline __get_index() = Metal.thread_position_in_grid_1d()

4 changes: 3 additions & 1 deletion src/TinyKernels.jl
@@ -2,7 +2,7 @@ module TinyKernels

export Kernel, AbstractDevice, AbstractGPUDevice
export CPUDevice, CUDADevice, AMDGPUDevice, MetalDevice
export device_array, device_synchronize, @tiny, @indices, @linearindex, @cartesianindex
export device_array, device_synchronize, synchronize, @tiny, @indices, @linearindex, @cartesianindex

if !isdefined(Base, :get_extension)
using Requires
@@ -48,6 +48,8 @@ function device_array end

function device_synchronize end

function synchronize end

function __get_index end

include("macros.jl")
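The empty `function synchronize end` stub is the standard hook for package extensions: the parent package owns the generic function, and each weak-dependency extension adds a backend-specific method to it. A minimal self-contained sketch of the pattern (module and type names are illustrative):

```julia
# Parent package: declare the generic function with no methods.
module MiniKernels
export synchronize
function synchronize end
struct FakeDevice end
end

# Extension (normally ext/FakeExt.jl, loaded only when the weak dep is present):
module FakeExt
import ..MiniKernels: synchronize, FakeDevice
synchronize(::FakeDevice) = println("synchronizing FakeDevice")
end

using .MiniKernels
synchronize(MiniKernels.FakeDevice())  # dispatches to the method added by FakeExt
```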
