Commit

update Nvidia multiplication
mratsim committed Aug 27, 2024
1 parent 0919dc1 commit 263b601
Showing 3 changed files with 22 additions and 17 deletions.
3 changes: 1 addition & 2 deletions constantine/math_compiler/impl_fields_globals.nim
@@ -157,8 +157,7 @@ proc getM0ninv*(asy: Assembler_LLVM, fd: FieldDescriptor): ValueRef =
fd.wordTy
)

-
- return m0ninv
+ return asy.load2(fd.wordTy, m0ninv, "m0ninv")

when isMainModule:
let asy = Assembler_LLVM.new("test_module", bkX86_64_Linux)
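Note on the getM0ninv change above: the helper now returns the loaded limb value (asy.load2) rather than a pointer to the cached global. The constant in question is the usual Montgomery constant m0ninv = -M[0]^-1 mod 2^w, derived from the least-significant limb of an odd modulus. The following is a minimal CPU-side sketch of how such a constant can be computed, assuming 64-bit words and using BN254's low limb as an illustrative input; the helper name is hypothetical and not Constantine's API:

func negInvModWord(m0: uint64): uint64 =
  ## -m0^(-1) mod 2^64 for odd m0 (the Montgomery "magic constant")
  var inv = m0                  # an odd m0 is its own inverse mod 2^3
  for _ in 0 ..< 5:             # each Newton step doubles the number of correct bits
    inv = inv * (2'u64 - m0 * inv)
  result = 0'u64 - inv          # negate modulo 2^64

when isMainModule:
  # Least-significant 64-bit limb of the BN254 field modulus (illustrative input)
  let m0 = 0x3C208C16D87CFD47'u64
  doAssert negInvModWord(m0) * m0 == high(uint64)   # i.e. m0ninv * m0 ≡ -1 (mod 2^64)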
4 changes: 3 additions & 1 deletion constantine/math_compiler/impl_fields_nvidia.nim
@@ -192,7 +192,7 @@ proc mtymul_CIOS_sparebit(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M:
## with parameters `a, b, modulus: Limbs -> Limbs`

let name =
- if finalReduce and fd.spareBits >= 2:
+ if not finalReduce and fd.spareBits >= 2:
"_mty_mulur.u" & $fd.w & "x" & $fd.numWords & "b2"
else:
doAssert fd.spareBits >= 1
@@ -338,6 +338,8 @@ proc mtymul_CIOS_sparebit(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M:
asy.store(r, t)
asy.br.retVoid()

+ asy.callFn(name, [r, a, b, M])
+
proc mtymul_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef, finalReduce = true) {.used.} =
## Generate an optimized modular addition kernel
## with parameters `a, b, modulus: Limbs -> Limbs`
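For context on the two changes above: mtymul_CIOS_sparebit generates Montgomery multiplication in the CIOS (Coarsely Integrated Operand Scanning) form; the "_mty_mulur" (unreduced) kernel name is now selected when finalReduce is false and the modulus leaves at least two spare bits, and the new asy.callFn(name, [r, a, b, M]) line makes the wrapper emit a call to the function it just defined. The sketch below is a plain-Nim CPU reference of the same CIOS loop, fixed to 32-bit limbs and a 256-bit modulus for illustration; the names, limb count, and the non-constant-time final subtraction are assumptions of this sketch, not the generated code:

const N = 8                        # 8 x 32-bit limbs = 256 bits (sketch only)

type Limbs = array[N, uint32]      # little-endian limbs

func mulAcc(hi, lo: var uint32; a, b, c, carry: uint32) =
  ## (hi, lo) <- a*b + c + carry (never overflows 64 bits)
  let full = uint64(a) * uint64(b) + uint64(c) + uint64(carry)
  hi = uint32(full shr 32)
  lo = uint32(full and 0xFFFF_FFFF'u64)

func montyMul(a, b, M: Limbs; m0ninv: uint32; finalReduce = true): Limbs =
  ## r <- a*b*R^-1 mod M with R = 2^(32*N), for an odd M with at least 1 spare bit
  var t: array[N + 2, uint32]
  for i in 0 ..< N:
    # Multiplication step: t += a * b[i]
    var C = 0'u32
    for j in 0 ..< N:
      mulAcc(C, t[j], a[j], b[i], t[j], C)
    var top = uint64(t[N]) + uint64(C)
    t[N]   = uint32(top and 0xFFFF_FFFF'u64)
    t[N+1] = uint32(top shr 32)
    # Reduction step: add m*M so the lowest limb of t becomes zero, then shift down
    let m = t[0] * m0ninv          # mod 2^32 by unsigned wraparound
    var lo: uint32
    mulAcc(C, lo, m, M[0], t[0], 0'u32)   # low word is zero by construction of m
    for j in 1 ..< N:
      mulAcc(C, t[j-1], m, M[j], t[j], C)
    top    = uint64(t[N]) + uint64(C)
    t[N-1] = uint32(top and 0xFFFF_FFFF'u64)
    t[N]   = t[N+1] + uint32(top shr 32)
  for j in 0 ..< N:
    result[j] = t[j]
  # With >= 1 spare bit, t < 2M, so one conditional subtraction finishes the job.
  # The "_mty_mulur" variant (finalReduce = false, spareBits >= 2) skips it.
  if finalReduce:
    var geq = true
    for j in countdown(N-1, 0):
      if result[j] != M[j]:
        geq = result[j] > M[j]
        break
    if geq:
      var borrow = 0'u64
      for j in 0 ..< N:
        let d = uint64(result[j]) - uint64(M[j]) - borrow
        result[j] = uint32(d and 0xFFFF_FFFF'u64)
        borrow = d shr 63

Skipping that final subtraction is only sound when intermediate values may exceed M without overflowing the limbs, which is why the unreduced variant is tied to spareBits >= 2 and why the name-selection condition needed the `not`.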
32 changes: 18 additions & 14 deletions tests/gpu/t_nvidia_fp.nim
@@ -15,7 +15,7 @@ import
constantine/platforms/static_for,
constantine/named/algebras,
constantine/math/arithmetic,
- constantine/math/io/io_bigints,
+ constantine/math/io/[io_bigints, io_fields],
constantine/math_compiler/[ir, pub_fields, codegen_nvidia],
# Test utilities
helpers/prng_unsafe
@@ -48,8 +48,6 @@ template gen_binop_test(
proc testName[Name: static Algebra](field: type FF[Name], wordSize: int) =
# Codegen
# -------------------------
- static: debugEcho field
-
let name = if field is Fp: $Name & "_fp"
else: $Name & "_fr"
let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & name & $wordSize))
@@ -89,7 +87,7 @@

gen_binop_test(t_field_add, genFpAdd, sum)
gen_binop_test(t_field_sub, genFpSub, diff)
- # gen_binop_test(t_field_mul, genFpMul, prod)
+ gen_binop_test(t_field_mul, genFpMul, prod)

proc main() =
const curves = [
@@ -115,20 +113,26 @@ proc main() =
t_field_add(Fp[curve], wordSize)
test "Nvidia GPU field substraction 𝔽p " & $wordSize & "-bit for " & $curve:
t_field_sub(Fp[curve], wordSize)
# test "Nvidia GPU field multiplication 𝔽p " & $wordSize & "-bit for " & $curve:
# # 64-bit integer fused-multiply-add with carry is buggy:
# # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
# t_field_mul(Fp[curve], wordSize)
test "Nvidia GPU field multiplication 𝔽p " & $wordSize & "-bit for " & $curve:
if wordSize == 64:
skip()
# 64-bit integer fused-multiply-add with carry is buggy:
# https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
else:
t_field_mul(Fp[curve], wordSize)

test "Nvidia GPU field addition 𝔽r " & $wordSize & "-bit for " & $curve:
t_field_add(Fr[curve], wordSize)
test "Nvidia GPU field substraction 𝔽r " & $wordSize & "-bit for " & $curve:
t_field_sub(Fr[curve], wordSize)
# test "Nvidia GPU field multiplication 𝔽r " & $wordSize & "-bit for " & $curve:
# # 64-bit integer fused-multiply-add with carry is buggy:
# # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
# t_field_mul(Fr[curve], wordSize)
test "Nvidia GPU field multiplication 𝔽r " & $wordSize & "-bit for " & $curve:
if wordSize == 64:
skip()
# 64-bit integer fused-multiply-add with carry is buggy:
# https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
else:
t_field_mul(Fr[curve], wordSize)

main()
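The re-enabled multiplication tests above skip the 64-bit limb configuration because, per the linked gist and forum thread, the PTX 64-bit fused multiply-add with carry sequence can produce wrong results. As a host-side reference, the sketch below shows what one step of that multiply-accumulate-with-carry chain must compute; the function names are illustrative and the association with mad.lo.cc.u64 / madc.hi.u64 chains follows the linked reports, not this test file:

func mulWide(a, b: uint64): tuple[hi, lo: uint64] =
  ## Full 128-bit product via 32-bit halves
  let
    aL = a and 0xFFFF_FFFF'u64
    aH = a shr 32
    bL = b and 0xFFFF_FFFF'u64
    bH = b shr 32
    ll = aL * bL
    lh = aL * bH
    hl = aH * bL
    hh = aH * bH
    mid = lh + (ll shr 32) + (hl and 0xFFFF_FFFF'u64)   # cannot overflow
  result.lo = (mid shl 32) or (ll and 0xFFFF_FFFF'u64)
  result.hi = hh + (mid shr 32) + (hl shr 32)

func mulAddCarry(a, b, acc, carryIn: uint64): tuple[hi, lo: uint64] =
  ## (hi, lo) <- a*b + acc + carryIn; a multi-limb multiplication threads the
  ## carry through a chain of these, which is where the 64-bit PTX bug bites.
  var (hi, lo) = mulWide(a, b)
  lo += acc
  if lo < acc: inc hi             # carry out of the low word
  lo += carryIn
  if lo < carryIn: inc hi
  result = (hi, lo)

when isMainModule:
  # Worst-case inputs still fit: (2^64-1)^2 + (2^64-1) + 1 = (2^64-1)*2^64 + 1
  let (hi, lo) = mulAddCarry(high(uint64), high(uint64), high(uint64), 1'u64)
  doAssert hi == high(uint64) and lo == 1'u64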
