Transition to HIP in AMDGPU (#10)
Use HIP for the AMDGPU extension and other minor fixes.
I am merging this even though Enzyme is not yet working with AMDGPU; the issue is unrelated to TinyKernels (see JuliaGPU/AMDGPU.jl#444).
luraess authored Jul 16, 2023
2 parents 3879c88 + 08d8e41 commit 38aaec3
Showing 8 changed files with 61 additions and 58 deletions.
26 changes: 16 additions & 10 deletions Project.toml
@@ -1,26 +1,32 @@
name = "TinyKernels"
uuid = "f7cbc414-f748-44bf-86e6-e44e9a55e39d"
authors = ["Ivan Utkin <[email protected]> and Ludovic Raess <[email protected]>"]
version = "0.3.2"
version = "0.4.0"

[deps]
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"

[compat]
AMDGPU = "0.5"
CUDA = "4"
Enzyme = "0.11"
Metal = "0.5"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[extensions]
CUDAExt = "CUDA"
AMDGPUExt = "AMDGPU"
MetalExt = "Metal"
CUDAExt = "CUDA"
EnzymeExt = "Enzyme"
MetalExt = "Metal"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
9 changes: 2 additions & 7 deletions README.md
@@ -6,20 +6,15 @@

TinyKernels.jl is mostly a heavily stripped-down version of [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) supporting a bare minimum of features. This package provides a sandbox for Julia GPU tooling and a way to measure kernel performance in a GPU-agnostic fashion. While the API of KernelAbstractions.jl is in a "transient" state, this package will provide a thin abstraction layer on top of the [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl), [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) and [Metal.jl](https://github.com/JuliaGPU/Metal.jl) packages.

TinyKernels.jl allows explicitly launching GPU kernels asynchronously on different streams or queues with a given priority. This feature facilitates overlapping computations with memory transfers in distributed configurations.
TinyKernels.jl allows explicitly launching GPU kernels asynchronously on different streams with a given priority. This feature facilitates overlapping computations with memory transfers in distributed configurations.
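As an illustration of this launch model, here is a minimal sketch assuming the `@tiny`/`device_array` API exported by the package and the kernel-instantiation pattern used in its examples; the `memcopy!` kernel, the array sizes, and the exact index-macro usage are illustrative and may differ in detail:

```julia
using TinyKernels
using CUDA  # loading CUDA.jl (or AMDGPU.jl / Metal.jl) activates the matching extension

# Kernel definition; @indices is assumed to yield the per-work-item indices.
@tiny function memcopy!(A, B)
    ix, iy = @indices
    @inbounds A[ix, iy] = B[ix, iy]
    return
end

device = CUDADevice()
A = device_array(Float64, device, 1024, 1024)
B = device_array(Float64, device, 1024, 1024)

memcopy_d! = memcopy!(device)  # assumed: @tiny generates this device-binding method

# Asynchronous launch on a high-priority stream; an event is returned immediately.
ev = memcopy_d!(A, B; ndrange=size(A), priority=:high)
wait(ev)  # block the host until this particular kernel has completed
```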

TinyKernels.jl supports automatic differentiation with [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) by overloading the `Enzyme.autodiff` function to enable reverse-mode AD of GPU (and CPU) kernels.
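A hedged sketch of how this hook might be used, based on the `EnzymeExt` code in this commit: `Enzyme.autodiff(kernel)` returns a new kernel whose arguments are forwarded to `Enzyme.autodiff_deferred`, so they are passed as Enzyme activity annotations. The `square!` kernel and the shadow arrays below are illustrative:

```julia
using TinyKernels, Enzyme
using CUDA

@tiny function square!(B, A)
    ix, iy = @indices
    @inbounds B[ix, iy] = A[ix, iy]^2
    return
end

device = CUDADevice()
A, Ā = device_array(Float64, device, 64, 64), device_array(Float64, device, 64, 64)
B, B̄ = device_array(Float64, device, 64, 64), device_array(Float64, device, 64, 64)
# Ā should start at zero (gradients accumulate); B̄ holds the incoming adjoint seed.

square_d!  = square!(device)
∇square_d! = Enzyme.autodiff(square_d!)  # kernel that runs the reverse pass

wait(∇square_d!(Duplicated(B, B̄), Duplicated(A, Ā); ndrange=size(A)))
# Ā now accumulates 2 .* A .* B̄
```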

Preliminary benchmarks can be found in [TinyBenchmarks.jl](https://github.com/luraess/TinyBenchmarks.jl), and a Metal playground in [MetalGPU](https://github.com/luraess/MetalGPU).

Stay tuned :rocket:

### Compat
- AMDGPU ≥ v0.4.8
- CUDA ≥ 3.13
- Metal ≥ v0.3.0
### Notes

⚠️ **Metal backend:**
- Only `Float32` is supported. For `Float64`, one could try using a construct from [DoubleFloats.jl](https://github.com/JuliaMath/DoubleFloats.jl/blob/ef689ccbab37d84943e2533309d34c6665229cab/src/Double.jl#L30), _which may impact performance_.
- Automatic differentiation (AD) capabilities (Enzyme.jl) currently do not work on the ARM GPU (Metal) and give erroneous results on the ARM CPU.
- Only `Float32` is supported. For `Float64`, one could try using a construct from [DoubleFloats.jl](https://github.com/JuliaMath/DoubleFloats.jl/blob/ef689ccbab37d84943e2533309d34c6665229cab/src/Double.jl#L30), _which may impact performance_.
2 changes: 1 addition & 1 deletion examples/example_2d.jl
@@ -51,4 +51,4 @@ function main(::Type{DAT}; device) where DAT
end

println("running on $backend device...")
main(eletype; device)
main(eletype; device)
60 changes: 28 additions & 32 deletions ext/AMDGPUExt.jl
@@ -9,70 +9,66 @@ else
end

import TinyKernels: AMDGPUDevice, AbstractEvent, Kernel
import TinyKernels: device_array, device_synchronize, __get_index, ndrange_to_indices
import TinyKernels: device_array, device_synchronize, synchronize, __get_index, ndrange_to_indices

import Base: wait

struct AMDGPUEvent <: AbstractEvent
signal::AMDGPU.ROCSignal
queue::AMDGPU.ROCQueue
event::AMDGPU.HIP.HIPEvent
end

wait(ev::AMDGPUEvent) = wait(ev.signal; queue=ev.queue)
wait(ev::AMDGPUEvent) = AMDGPU.HIP.synchronize(ev.event)
wait(evs::AbstractArray{AMDGPUEvent}) = wait.(evs)

mutable struct QueuePool
next_queue_idx::Int
queues::Vector{AMDGPU.ROCQueue}
mutable struct StreamPool
next_stream_idx::Int
streams::Vector{AMDGPU.HIPStream}
end

const MAX_QUEUES = 2
const QUEUES = Dict{Symbol,QueuePool}()
const MAX_STREAMS = 6
const STREAMS = Dict{Symbol,StreamPool}()

function get_queue(priority::Symbol)
pool = get!(QUEUES, priority) do
max_queues = MAX_QUEUES
function get_stream(priority::Symbol)
pool = get!(STREAMS, priority) do
max_streams = MAX_STREAMS
roc_priority = if priority == :high
:high
elseif priority == :low
:low
else
error("unknown priority $priority")
end
QueuePool(1, [AMDGPU.ROCQueue(AMDGPU.default_device(); priority=roc_priority) for _ in 1:max_queues])
StreamPool(0, [AMDGPU.HIPStream(roc_priority) for _ in 1:max_streams])
end
return pick_queue(pool)
return pick_stream(pool)
end

function pick_queue(pool::QueuePool)
# round-robin queue selection
pool.next_queue_idx += 1
pool.next_queue_idx = ((pool.next_queue_idx - 1) % length(pool.queues)) + 1
return pool.queues[pool.next_queue_idx]
function pick_stream(pool::StreamPool)
# round-robin stream selection
pool.next_stream_idx += 1
pool.next_stream_idx = ((pool.next_stream_idx - 1) % length(pool.streams)) + 1
return pool.streams[pool.next_stream_idx]
end

function (k::Kernel{<:AMDGPUDevice})(args...; ndrange, priority=:low, nthreads=nothing)
ndrange = ndrange_to_indices(ndrange)
if isnothing(nthreads)
nthreads = min(length(ndrange), 256)
end
ngrid = length(ndrange)
# create signal
sig = AMDGPU.ROCSignal()
# launch kernel
queue = get_queue(priority)
AMDGPU.HSA.signal_store_screlease(sig.signal, 1)
AMDGPU.@roc wait=false mark=false signal=sig groupsize=nthreads gridsize=ngrid queue=queue k.fun(ndrange, args...)
return AMDGPUEvent(sig, queue)
nblocks = cld(length(ndrange), nthreads)
stream = get_stream(priority)
event = AMDGPU.HIP.HIPEvent(stream)
AMDGPU.@roc groupsize=nthreads gridsize=nblocks stream=stream k.fun(ndrange, args...)
AMDGPU.HIP.record(event)
return AMDGPUEvent(event)
end

device_array(::Type{T}, ::AMDGPUDevice, dims...) where T = AMDGPU.ROCArray{T}(undef, dims)

function device_synchronize(::AMDGPUDevice)
wait(AMDGPU.barrier_and!(AMDGPU.default_queue(), AMDGPU.active_kernels(AMDGPU.default_queue())))
return
end
device_synchronize(::AMDGPUDevice) = AMDGPU.HIP.device_synchronize()

synchronize(::AMDGPUDevice) = AMDGPU.synchronize()

@device_override @inline __get_index() = (AMDGPU.workgroupIdx().x-1)*AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x

end
end # module
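The `StreamPool` above hands out HIP streams round-robin, so consecutive launches at the same priority land on different streams. A small, GPU-free sketch of that selection logic (the names mirror the extension but are illustrative):

```julia
# GPU-free sketch of the round-robin stream pool used by the extensions.
mutable struct Pool{T}
    next_idx::Int
    items::Vector{T}
end

Pool(items::Vector{T}) where {T} = Pool{T}(0, items)

function pick!(pool::Pool)
    # advance the counter, then wrap it back into 1:length(items)
    pool.next_idx += 1
    pool.next_idx = (pool.next_idx - 1) % length(pool.items) + 1
    return pool.items[pool.next_idx]
end

pool = Pool(["stream-1", "stream-2", "stream-3"])
@assert pick!(pool) == "stream-1"  # counter starts at 0, so the first pick is stream-1
@assert pick!(pool) == "stream-2"
@assert pick!(pool) == "stream-3"
@assert pick!(pool) == "stream-1"  # wraps around
```

Initialising the counter to 0 (as the CUDA and Metal pools now also do) makes the first launch pick the first stream in the pool rather than the second.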
8 changes: 5 additions & 3 deletions ext/CUDAExt.jl
@@ -9,7 +9,7 @@ else
end

import TinyKernels: CUDADevice, AbstractEvent, Kernel
import TinyKernels: device_array, device_synchronize, __get_index, ndrange_to_indices
import TinyKernels: device_array, device_synchronize, synchronize, __get_index, ndrange_to_indices

import Base: wait

@@ -39,7 +39,7 @@ function get_stream(priority::Symbol)
else
error("unknown priority $priority")
end
StreamPool(1, [CUDA.CuStream(; priority=cu_priority) for _ in 1:max_streams])
StreamPool(0, [CUDA.CuStream(; priority=cu_priority) for _ in 1:max_streams])
end
return pick_stream(pool)
end
@@ -69,7 +69,9 @@ end

device_array(::Type{T}, ::CUDADevice, dims...) where T = CUDA.CuArray{T}(undef, dims)

device_synchronize(::CUDADevice) = CUDA.synchronize()
device_synchronize(::CUDADevice) = CUDA.device_synchronize()

synchronize(::CUDADevice) = CUDA.synchronize()

@device_override @inline __get_index() = (CUDA.blockIdx().x-1)*CUDA.blockDim().x + CUDA.threadIdx().x

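With this change the package distinguishes two synchronization levels: `device_synchronize` now maps to `CUDA.device_synchronize()` (a device-wide barrier), while the new `synchronize` maps to `CUDA.synchronize()` (waits for the current stream/task). A hedged usage sketch under the same API assumptions as the README example above; the `fill_two!` kernel is illustrative:

```julia
using TinyKernels
using CUDA

@tiny function fill_two!(A)
    ix, iy = @indices
    @inbounds A[ix, iy] = 2.0
    return
end

device = CUDADevice()
A = device_array(Float64, device, 256, 256)
fill_two_d! = fill_two!(device)

ev = fill_two_d!(A; ndrange=size(A), priority=:high)
wait(ev)                    # wait for this one launch only

synchronize(device)         # wait for work on the current stream/task
device_synchronize(device)  # device-wide barrier: everything in flight, all streams
```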
2 changes: 1 addition & 1 deletion ext/EnzymeExt.jl
@@ -7,7 +7,7 @@ import TinyKernels: AbstractGPUDevice, CPUDevice, Kernel
function Enzyme.autodiff(kernel::Kernel{<:AbstractGPUDevice, Fun}) where Fun
fun = kernel.fun
function df(ctx, args...)
Enzyme.autodiff_deferred(fun::Fun, Enzyme.Const, ctx, args...)
Enzyme.autodiff_deferred(Enzyme.Reverse, fun::Fun, Enzyme.Const, ctx, args...)
return
end
similar(kernel, df)
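Passing `Enzyme.Reverse` as the first argument follows the calling convention of Enzyme 0.11, where the differentiation mode is given explicitly. A small CPU-only illustration of that convention, independent of TinyKernels (the function `f` is made up):

```julia
using Enzyme

f(x) = x^2 + 3x

# Enzyme >= 0.11: mode first, then the function, the return activity,
# and the annotated arguments.
grads = Enzyme.autodiff(Enzyme.Reverse, f, Enzyme.Active, Enzyme.Active(2.0))
# grads[1][1] == 7.0, i.e. d/dx (x^2 + 3x) evaluated at x = 2
```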
8 changes: 5 additions & 3 deletions ext/MetalExt.jl
@@ -9,7 +9,7 @@ else
end

import TinyKernels: MetalDevice, AbstractEvent, Kernel
import TinyKernels: device_array, device_synchronize, __get_index, ndrange_to_indices
import TinyKernels: device_array, device_synchronize, synchronize, __get_index, ndrange_to_indices

import Base: wait

@@ -32,7 +32,7 @@ function get_queue(priority::Symbol) # no priority selection yet
pool = get!(QUEUES, priority) do
max_queues = MAX_QUEUES
dev = Metal.current_device()
QueuePool(1, [Metal.MTLCommandQueue(dev) for _ in 1:max_queues])
QueuePool(0, [Metal.MTLCommandQueue(dev) for _ in 1:max_queues])
end
return pick_queue(pool)
end
@@ -58,7 +58,9 @@ end

device_array(::Type{T}, ::MetalDevice, dims...) where {T} = Metal.MtlArray{T}(undef, dims)

device_synchronize(::MetalDevice) = Metal.synchronize() # device_synchronize() forces device sync
device_synchronize(::MetalDevice) = Metal.device_synchronize() # device_synchronize() forces device sync

synchronize(::MetalDevice) = Metal.synchronize()

@device_override @inline __get_index() = Metal.thread_position_in_grid_1d()

4 changes: 3 additions & 1 deletion src/TinyKernels.jl
@@ -2,7 +2,7 @@ module TinyKernels

export Kernel, AbstractDevice, AbstractGPUDevice
export CPUDevice, CUDADevice, AMDGPUDevice, MetalDevice
export device_array, device_synchronize, @tiny, @indices, @linearindex, @cartesianindex
export device_array, device_synchronize, synchronize, @tiny, @indices, @linearindex, @cartesianindex

if !isdefined(Base, :get_extension)
using Requires
@@ -48,6 +48,8 @@ function device_array end

function device_synchronize end

function synchronize end

function __get_index end

include("macros.jl")
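The empty `function synchronize end` stub is the standard hook for package extensions: the parent package owns the generic function, and each weak-dependency extension adds a backend-specific method to it. A minimal self-contained sketch of the pattern (module and type names are illustrative):

```julia
# Parent package: declare the generic function with no methods.
module MiniKernels
export synchronize
function synchronize end
struct FakeDevice end
end

# Extension (normally ext/FakeExt.jl, loaded only when the weak dep is present):
module FakeExt
import ..MiniKernels: synchronize, FakeDevice
synchronize(::FakeDevice) = println("synchronizing FakeDevice")
end

using .MiniKernels
synchronize(MiniKernels.FakeDevice())  # dispatches to the method added by FakeExt
```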
