Commit

update Nvidia multiplication
mratsim committed Aug 27, 2024
1 parent 0919dc1 commit 263b601
Showing 3 changed files with 22 additions and 17 deletions.
3 changes: 1 addition & 2 deletions constantine/math_compiler/impl_fields_globals.nim
@@ -157,8 +157,7 @@ proc getM0ninv*(asy: Assembler_LLVM, fd: FieldDescriptor): ValueRef =
fd.wordTy
)

-
- return m0ninv
+ return asy.load2(fd.wordTy, m0ninv, "m0ninv")

when isMainModule:
let asy = Assembler_LLVM.new("test_module", bkX86_64_Linux)
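Note on the getM0ninv change above: the helper now returns the loaded limb value (asy.load2) rather than a pointer to the cached global. The constant in question is the usual Montgomery constant m0ninv = -M[0]^-1 mod 2^w, derived from the least-significant limb of an odd modulus. The following is a minimal CPU-side sketch of how such a constant can be computed, assuming 64-bit words and using BN254's low limb as an illustrative input; the helper name is hypothetical and not Constantine's API:

func negInvModWord(m0: uint64): uint64 =
  ## -m0^(-1) mod 2^64 for odd m0 (the Montgomery "magic constant")
  var inv = m0                  # an odd m0 is its own inverse mod 2^3
  for _ in 0 ..< 5:             # each Newton step doubles the number of correct bits
    inv = inv * (2'u64 - m0 * inv)
  result = 0'u64 - inv          # negate modulo 2^64

when isMainModule:
  # Least-significant 64-bit limb of the BN254 field modulus (illustrative input)
  let m0 = 0x3C208C16D87CFD47'u64
  doAssert negInvModWord(m0) * m0 == high(uint64)   # i.e. m0ninv * m0 ≡ -1 (mod 2^64)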
4 changes: 3 additions & 1 deletion constantine/math_compiler/impl_fields_nvidia.nim
@@ -192,7 +192,7 @@ proc mtymul_CIOS_sparebit(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M:
## with parameters `a, b, modulus: Limbs -> Limbs`

let name =
- if finalReduce and fd.spareBits >= 2:
+ if not finalReduce and fd.spareBits >= 2:
"_mty_mulur.u" & $fd.w & "x" & $fd.numWords & "b2"
else:
doAssert fd.spareBits >= 1
@@ -338,6 +338,8 @@ proc mtymul_CIOS_sparebit(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M:
asy.store(r, t)
asy.br.retVoid()

+ asy.callFn(name, [r, a, b, M])
+
proc mtymul_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef, finalReduce = true) {.used.} =
## Generate an optimized modular addition kernel
## with parameters `a, b, modulus: Limbs -> Limbs`
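For context on the two changes above: mtymul_CIOS_sparebit generates Montgomery multiplication in the CIOS (Coarsely Integrated Operand Scanning) form; the "_mty_mulur" (unreduced) kernel name is now selected when finalReduce is false and the modulus leaves at least two spare bits, and the new asy.callFn(name, [r, a, b, M]) line makes the wrapper emit a call to the function it just defined. The sketch below is a plain-Nim CPU reference of the same CIOS loop, fixed to 32-bit limbs and a 256-bit modulus for illustration; the names, limb count, and the non-constant-time final subtraction are assumptions of this sketch, not the generated code:

const N = 8                        # 8 x 32-bit limbs = 256 bits (sketch only)

type Limbs = array[N, uint32]      # little-endian limbs

func mulAcc(hi, lo: var uint32; a, b, c, carry: uint32) =
  ## (hi, lo) <- a*b + c + carry (never overflows 64 bits)
  let full = uint64(a) * uint64(b) + uint64(c) + uint64(carry)
  hi = uint32(full shr 32)
  lo = uint32(full and 0xFFFF_FFFF'u64)

func montyMul(a, b, M: Limbs; m0ninv: uint32; finalReduce = true): Limbs =
  ## r <- a*b*R^-1 mod M with R = 2^(32*N), for an odd M with at least 1 spare bit
  var t: array[N + 2, uint32]
  for i in 0 ..< N:
    # Multiplication step: t += a * b[i]
    var C = 0'u32
    for j in 0 ..< N:
      mulAcc(C, t[j], a[j], b[i], t[j], C)
    var top = uint64(t[N]) + uint64(C)
    t[N]   = uint32(top and 0xFFFF_FFFF'u64)
    t[N+1] = uint32(top shr 32)
    # Reduction step: add m*M so the lowest limb of t becomes zero, then shift down
    let m = t[0] * m0ninv          # mod 2^32 by unsigned wraparound
    var lo: uint32
    mulAcc(C, lo, m, M[0], t[0], 0'u32)   # low word is zero by construction of m
    for j in 1 ..< N:
      mulAcc(C, t[j-1], m, M[j], t[j], C)
    top    = uint64(t[N]) + uint64(C)
    t[N-1] = uint32(top and 0xFFFF_FFFF'u64)
    t[N]   = t[N+1] + uint32(top shr 32)
  for j in 0 ..< N:
    result[j] = t[j]
  # With >= 1 spare bit, t < 2M, so one conditional subtraction finishes the job.
  # The "_mty_mulur" variant (finalReduce = false, spareBits >= 2) skips it.
  if finalReduce:
    var geq = true
    for j in countdown(N-1, 0):
      if result[j] != M[j]:
        geq = result[j] > M[j]
        break
    if geq:
      var borrow = 0'u64
      for j in 0 ..< N:
        let d = uint64(result[j]) - uint64(M[j]) - borrow
        result[j] = uint32(d and 0xFFFF_FFFF'u64)
        borrow = d shr 63

Skipping that final subtraction is only sound when intermediate values may exceed M without overflowing the limbs, which is why the unreduced variant is tied to spareBits >= 2 and why the name-selection condition needed the `not`.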
32 changes: 18 additions & 14 deletions tests/gpu/t_nvidia_fp.nim
@@ -15,7 +15,7 @@ import
constantine/platforms/static_for,
constantine/named/algebras,
constantine/math/arithmetic,
- constantine/math/io/io_bigints,
+ constantine/math/io/[io_bigints, io_fields],
constantine/math_compiler/[ir, pub_fields, codegen_nvidia],
# Test utilities
helpers/prng_unsafe
@@ -48,8 +48,6 @@ template gen_binop_test(
proc testName[Name: static Algebra](field: type FF[Name], wordSize: int) =
# Codegen
# -------------------------
- static: debugEcho field
-
let name = if field is Fp: $Name & "_fp"
else: $Name & "_fr"
let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & name & $wordSize))
@@ -89,7 +87,7 @@

gen_binop_test(t_field_add, genFpAdd, sum)
gen_binop_test(t_field_sub, genFpSub, diff)
- # gen_binop_test(t_field_mul, genFpMul, prod)
+ gen_binop_test(t_field_mul, genFpMul, prod)

proc main() =
const curves = [
@@ -115,20 +113,26 @@ proc main() =
t_field_add(Fp[curve], wordSize)
test "Nvidia GPU field substraction 𝔽p " & $wordSize & "-bit for " & $curve:
t_field_sub(Fp[curve], wordSize)
# test "Nvidia GPU field multiplication 𝔽p " & $wordSize & "-bit for " & $curve:
# # 64-bit integer fused-multiply-add with carry is buggy:
# # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
# t_field_mul(Fp[curve], wordSize)
test "Nvidia GPU field multiplication 𝔽p " & $wordSize & "-bit for " & $curve:
if wordSize == 64:
skip()
# 64-bit integer fused-multiply-add with carry is buggy:
# https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
else:
t_field_mul(Fp[curve], wordSize)

test "Nvidia GPU field addition 𝔽r " & $wordSize & "-bit for " & $curve:
t_field_add(Fr[curve], wordSize)
test "Nvidia GPU field substraction 𝔽r " & $wordSize & "-bit for " & $curve:
t_field_sub(Fr[curve], wordSize)
# test "Nvidia GPU field multiplication 𝔽r " & $wordSize & "-bit for " & $curve:
# # 64-bit integer fused-multiply-add with carry is buggy:
# # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
# t_field_mul(Fr[curve], wordSize)
test "Nvidia GPU field multiplication 𝔽r " & $wordSize & "-bit for " & $curve:
if wordSize == 64:
skip()
# 64-bit integer fused-multiply-add with carry is buggy:
# https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
# https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
else:
t_field_mul(Fr[curve], wordSize)

main()
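The re-enabled multiplication tests above skip the 64-bit limb configuration because, per the linked gist and forum thread, the PTX 64-bit fused multiply-add with carry sequence can produce wrong results. As a host-side reference, the sketch below shows what one step of that multiply-accumulate-with-carry chain must compute; the function names are illustrative and the association with mad.lo.cc.u64 / madc.hi.u64 chains follows the linked reports, not this test file:

func mulWide(a, b: uint64): tuple[hi, lo: uint64] =
  ## Full 128-bit product via 32-bit halves
  let
    aL = a and 0xFFFF_FFFF'u64
    aH = a shr 32
    bL = b and 0xFFFF_FFFF'u64
    bH = b shr 32
    ll = aL * bL
    lh = aL * bH
    hl = aH * bL
    hh = aH * bH
    mid = lh + (ll shr 32) + (hl and 0xFFFF_FFFF'u64)   # cannot overflow
  result.lo = (mid shl 32) or (ll and 0xFFFF_FFFF'u64)
  result.hi = hh + (mid shr 32) + (hl shr 32)

func mulAddCarry(a, b, acc, carryIn: uint64): tuple[hi, lo: uint64] =
  ## (hi, lo) <- a*b + acc + carryIn; a multi-limb multiplication threads the
  ## carry through a chain of these, which is where the 64-bit PTX bug bites.
  var (hi, lo) = mulWide(a, b)
  lo += acc
  if lo < acc: inc hi             # carry out of the low word
  lo += carryIn
  if lo < carryIn: inc hi
  result = (hi, lo)

when isMainModule:
  # Worst-case inputs still fit: (2^64-1)^2 + (2^64-1) + 1 = (2^64-1)*2^64 + 1
  let (hi, lo) = mulAddCarry(high(uint64), high(uint64), high(uint64), 1'u64)
  doAssert hi == high(uint64) and lo == 1'u64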
