AMDGPU JIT compiler (#453)

* feat(AMD GPU): initial commit to support AMD GPU (working for ASM but not object code) * feat(AMD GPU): 🔥 🔥 end to end JIT compilation to AMD GPU is working
mratsim · Aug 5, 2024 · 1e34ec2 · 1e34ec2
1 parent cfe077b
commit 1e34ec2
Show file tree

Hide file tree

Showing 13 changed files with 2,093 additions and 36 deletions.
diff --git a/constantine/math_compiler/codegen_amdgpu.nim b/constantine/math_compiler/codegen_amdgpu.nim
@@ -0,0 +1,188 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  constantine/platforms/abis/amdgpu_abi {.all.},
+  constantine/platforms/abis/amdcomgr_abi,
+  constantine/platforms/abis/c_abi,
+  constantine/platforms/llvm/llvm,
+  constantine/platforms/primitives,
+  ./ir
+
+export
+  amdgpu_abi,
+  Flag, flag, wrapOpenArrayLenType
+
+# ############################################################
+#
+#                     AMD GPUs API
+#
+# ############################################################
+
+# Hip Runtime API
+# ------------------------------------------------------------
+
+template check*(status: HipError) =
+  ## Check the status code of a Hip operation
+  ## Exit program with error if failure
+
+  let code = status # ensure that the input expression is evaluated once only
+  if code != hipSuccess:
+    writeStackTrace()
+    stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
+    quit 1
+
+func hipModuleGetFunction*(kernel: var HipFunction, module: HipModule, fnName: openArray[char]): HipError {.inline.}=
+  hipModuleGetFunction(kernel, module, fnName[0].unsafeAddr)
+
+proc getGcnArchName*(deviceID: int32): string =
+  var prop: HipDeviceProp
+  check hipGetDeviceProperties(prop, deviceID)
+
+  for c in prop.gcnArchName:
+    if c != '\0':
+      result.add c
+
+proc hipDeviceInit*(deviceID = 0'i32): HipDevice =
+
+  check hipInit(deviceID.uint32)
+
+  var devCount: int32
+  check hipGetDeviceCount(devCount)
+  if devCount == 0:
+    echo "hipDeviceInit error: no devices supporting AMD ROCm/HIP"
+    quit 1
+
+  var hipDevice: HipDevice
+  check hipDeviceGet(hipDevice, deviceID)
+  var name = newString(128)
+  check hipDeviceGetName(name[0].addr, name.len.int32, hipDevice)
+  echo "Using HIP Device [", deviceID, "]: ", cstring(name)
+  echo "AMD GCN ARCH: ", deviceID.getGcnArchName()
+
+  return hipDevice
+
+# ############################################################
+#
+#                   LLVM IR for AMD GPUs
+#
+# ############################################################
+#
+# Note:
+#   __device__ functions for field and elliptic curve arithmetic
+#   might be compiled by default with scalar codegen
+#
+#   We will need to either:
+#   - Derive explicitly a vectorized version of the warp/wave size (32)
+#   - Derive implicitly a vectorized version, probably with __forceinline__
+
+proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) =
+  ## Create a public wrapper of a Hip device function
+  ##
+  ## A function named `addmod` can be found by appending _public
+  ##   check hipModuleGetFunction(fnPointer, cuModule, "addmod_public")
+
+  let pubName = fn.fnImpl.getName() & "_public"
+  let pubFn = module.addFunction(cstring(pubName), fn.fnTy)
+
+  let ctx = module.getContext()
+  let builder = ctx.createBuilder()
+  defer: builder.dispose()
+
+  let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody")
+  builder.positionAtEnd(blck)
+
+  var args = newSeq[ValueRef](fn.fnTy.countParamTypes())
+  for i, arg in mpairs(args):
+    arg = pubFn.getParam(i.uint32)
+  discard builder.call2(fn.fnTy, fn.fnImpl, args)
+
+  # A public kernel must return void
+  builder.retVoid()
+  pubFn.setCallingConvention(AMDGPU_KERNEL)
+
+# ############################################################
+#
+#                      Code generation
+#
+# ############################################################
+
+template check*(status: ComgrStatus) =
+  ## Check the status code of a Comgr operation
+  ## Exit program with error if failure
+
+  let code = status # ensure that the input expression is evaluated once only
+  if code != AMD_COMGR_STATUS_SUCCESS:
+    writeStackTrace()
+    stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
+    quit 1
+
+
+proc linkAmdGpu*(reloc_obj: seq[byte], gcnArchName: string): seq[byte] {.noInline.} =
+  ## Link a relocatable object code
+  ## into an executable that can be used through hipModuleLoadData
+  var roc: ComgrData
+  check amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, roc)
+  defer: check amd_comgr_release_data(roc)
+
+  var ai: ComgrActionInfo
+  check amd_comgr_create_action_info(ai)
+  defer: check amd_comgr_destroy_action_info(ai)
+
+  var ds: ComgrDataset
+  check amd_comgr_create_data_set(ds)
+  defer: check amd_comgr_destroy_data_set(ds)
+
+  var dsOut: ComgrDataset
+  check amd_comgr_create_data_set(dsOut)
+  defer: check amd_comgr_destroy_data_set(dsOut)
+
+  check roc.amd_comgr_set_data(reloc_obj.len.csize_t(), reloc_obj[0].addr)
+  check roc.amd_comgr_set_data_name("linkAmdGpu-input.o")
+  check ds.amd_comgr_data_set_add(roc)
+
+  check ai.amd_comgr_action_info_set_isa_name(
+    cstring("amdgcn-amd-amdhsa--" & gcnArchName)
+  )
+
+  check amd_comgr_do_action(
+    AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
+    info = ai,
+    input = ds,
+    output = dsOut)
+
+  # Extract the executable
+  # ------------------------------------------------
+
+  var exe: ComgrData
+  check amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, exe)
+  defer: check amd_comgr_release_data(exe)
+
+  check amd_comgr_action_data_get_data(
+    dsOut, AMD_COMGR_DATA_KIND_EXECUTABLE,
+    index = 0, exe)
+
+  # Query the required buffer size
+  var size: csize_t
+  check amd_comgr_get_data(
+    exe, size, nil)
+
+  # Size includes nul char
+  # But we use seq[byte] not a string, so Nim doesn't auto-inster a \0
+  # Hence allocation size is exact.
+  result.setLen(int size)
+
+  check amd_comgr_get_data(
+    exe, size, result[0].addr)
+
+
+# ############################################################
+#
+#                      Code execution
+#
+# ############################################################
diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim
@@ -7,15 +7,15 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  constantine/platforms/llvm/bindings/nvidia_abi {.all.},
-  constantine/platforms/llvm/bindings/c_abi,
+  constantine/platforms/abis/nvidia_abi {.all.},
+  constantine/platforms/abis/c_abi,
   constantine/platforms/llvm/[llvm, nvidia_inlineasm],
   constantine/platforms/primitives,
   ./ir
 
 export
   nvidia_abi, nvidia_inlineasm,
-  Flag, flag
+  Flag, flag, wrapOpenArrayLenType
 
 # ############################################################
 #
@@ -131,7 +131,7 @@ proc tagCudaKernel(module: ModuleRef, fn: FnDef) =
     ]))
   )
 
-proc setCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
+proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
   ## Create a public wrapper of a cuda device function
   ##
   ## A function named `addmod` can be found by appending _public
@@ -202,7 +202,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str
     errMsg.dispose()
     quit 1
 
-  return machine.emitToString(asy.module, AssemblyFile)
+  return machine.emitTo[:string](asy.module, AssemblyFile)
 
 # ############################################################
 #
@@ -263,4 +263,4 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =
 
   check cuMemFree(rGPU)
   check cuMemFree(aGPU)
-  check cuMemFree(bGPU)
+  check cuMemFree(bGPU)