From 432a91e3a935915c91e2946dd4144ee1cf10cb07 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Mon, 5 Aug 2024 22:25:03 +0200 Subject: [PATCH] feat(LLVM): add codegenerator for saturated field add/sub --- PLANNING.md | 8 + .../assembly/limbs_asm_modular_x86.nim | 2 +- constantine/math_compiler/codegen_nvidia.nim | 4 +- .../math_compiler/impl_fields_nvidia.nim | 13 +- constantine/math_compiler/impl_fields_sat.nim | 166 ++++++++++ constantine/math_compiler/ir.nim | 6 + constantine/platforms/abis/llvm_abi.nim | 32 +- .../extended_precision_64bit_uint128.nim | 4 +- .../extended_precision_x86_64_msvc.nim | 6 +- .../{nvidia_inlineasm.nim => asm_nvidia.nim} | 0 .../platforms/llvm/asm_x86.nim | 0 constantine/platforms/llvm/llvm.nim | 35 ++- .../platforms/llvm/super_instructions.nim | 232 ++++++++++++++ research/codegen/x86_instr.nim | 96 ------ research/codegen/x86_poc.nim | 287 ++---------------- 15 files changed, 513 insertions(+), 378 deletions(-) create mode 100644 constantine/math_compiler/impl_fields_sat.nim rename constantine/platforms/llvm/{nvidia_inlineasm.nim => asm_nvidia.nim} (100%) rename research/codegen/x86_inlineasm.nim => constantine/platforms/llvm/asm_x86.nim (100%) create mode 100644 constantine/platforms/llvm/super_instructions.nim delete mode 100644 research/codegen/x86_instr.nim diff --git a/PLANNING.md b/PLANNING.md index 7fd6f956..b4258d26 100644 --- a/PLANNING.md +++ b/PLANNING.md @@ -101,6 +101,14 @@ Other tracks are stretch goals, contributions towards them are accepted. - introduce batchAffine_vartime - Optimized square_repeated in assembly for Montgomery and Crandall/Pseudo-Mersenne primes - Optimized elliptic curve directly calling assembly without ADX checks and limited input/output movement in registers or using function multi-versioning. +- LLVM IR: + - use internal or private linkage type + - look into calling conventions like "fast" or "Tail fast" + - check if returning a value from function is propely optimized + compared to in-place result + - use readnone (pure) and readmem attribute for functions + - look into passing parameter as arrays instead of pointers? 
+ - use hot function attribute ### User Experience track diff --git a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim index 12da5fcd..0db85007 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim @@ -80,7 +80,7 @@ proc finalSubMayOverflowImpl*( ctx.mov scratch[i], a[i] ctx.sbb scratch[i], M[i] - # If it overflows here, it means that it was + # If it underflows here, it means that it was # smaller than the modulus and we don't need `scratch` ctx.sbb scratchReg, 0 diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim index 19e92019..63245f95 100644 --- a/constantine/math_compiler/codegen_nvidia.nim +++ b/constantine/math_compiler/codegen_nvidia.nim @@ -9,12 +9,12 @@ import constantine/platforms/abis/nvidia_abi {.all.}, constantine/platforms/abis/c_abi, - constantine/platforms/llvm/[llvm, nvidia_inlineasm], + constantine/platforms/llvm/llvm, constantine/platforms/primitives, ./ir export - nvidia_abi, nvidia_inlineasm, + nvidia_abi, Flag, flag, wrapOpenArrayLenType # ############################################################ diff --git a/constantine/math_compiler/impl_fields_nvidia.nim b/constantine/math_compiler/impl_fields_nvidia.nim index 0ffbb5b1..6b034701 100644 --- a/constantine/math_compiler/impl_fields_nvidia.nim +++ b/constantine/math_compiler/impl_fields_nvidia.nim @@ -7,8 +7,8 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ../platforms/llvm/llvm, - ./ir, ./codegen_nvidia + constantine/platforms/llvm/[llvm, asm_nvidia], + ./ir # ############################################################ # @@ -40,8 +40,11 @@ import # but the carry codegen of madc.hi.cc.u64 has off-by-one # - https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 # - old 32-bit bug: https://forums.developer.nvidia.com/t/wrong-result-returned-by-madc-hi-u64-ptx-instruction-for-specific-operands/196094 +# +# See instruction throughput +# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions -proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubMayOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -74,7 +77,7 @@ proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, for i in 0 ..< N: r[i] = bld.slct(scratch[i], a[i], underflowedModulus) -proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubNoOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -354,4 +357,4 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub = false): FnDef = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) \ No newline at end of file + return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) diff --git a/constantine/math_compiler/impl_fields_sat.nim b/constantine/math_compiler/impl_fields_sat.nim new file mode 100644 index 00000000..4910548d --- /dev/null +++ 
b/constantine/math_compiler/impl_fields_sat.nim @@ -0,0 +1,166 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/llvm/[llvm, super_instructions], + ./ir, ./codegen_nvidia + +# ############################################################ +# +# Field arithmetic with saturated limbs +# +# ############################################################ +# +# This implements field operations in pure LLVM +# using saturated limbs, i.e. 64-bit words on 64-bit platforms. +# +# This relies on hardware addition-with-carry and substraction-with-borrow +# for efficiency. +# +# As such it is not suitable for platforms with no carry flags such as: +# - WASM +# - MIPS +# - RISC-V +# - Metal +# +# It may be suitable for Intel GPUs as the virtual ISA does support add-carry +# +# It is suitable for: +# - ARM +# - AMD GPUs (for prototyping) +# +# The following backends have better optimizations through assembly: +# - x86: access to ADOX and ADCX interleaved double-carry chain +# - Nvidia: access to multiply accumulate instruction +# and non-interleaved double-carry chain +# +# AMD GPUs may benefits from using 24-bit limbs +# - https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/programmer-references/AMD_OpenCL_Programming_Optimization_Guide2.pdf +# p2-23: +# Generally, the throughput and latency for 32-bit integer operations is the same +# as for single-precision floating point operations. +# 24-bit integer MULs and MADs have four times the throughput of 32-bit integer +# multiplies. 24-bit signed and unsigned integers are natively supported on the +# GCN family of devices. The use of OpenCL built-in functions for mul24 and mad24 +# is encouraged. Note that mul24 can be useful for array indexing operations +# Doc from 2015, it might not apply to RDNA family +# - https://free.eol.cn/edu_net/edudown/AMDppt/OpenCL%20Programming%20and%20Optimization%20-%20Part%20I.pdf +# slide 24 +# +# - https://chipsandcheese.com/2023/01/07/microbenchmarking-amds-rdna-3-graphics-architecture/ +# "Since Turing, Nvidia also achieves very good integer multiplication performance. +# Integer multiplication appears to be extremely rare in shader code, +# and AMD doesn’t seem to have optimized for it. +# 32-bit integer multiplication executes at around a quarter of FP32 rate, +# and latency is pretty high too." + +proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array, carry: ValueRef) = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. 
+ ## + ## To be used when the final substraction can + ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) + + let bld = asy.builder + let fieldTy = cm.getFieldType(field) + let wordTy = cm.getWordType(field) + let scratch = bld.makeArray(fieldTy) + let M = cm.getModulus(field) + let N = M.len + + let zero_i1 = constInt(asy.i1_t, 0) + let zero = constInt(wordTy, 0) + + # Mask: contains 0xFFFF or 0x0000 + let (_, mask) = bld.subborrow(zero, zero, carry) + + # Now substract the modulus, and test a < M + # (underflow) with the last borrow + var b: ValueRef + (b, scratch[0]) = bld.subborrow(a[0], M[0], zero_i1) + for i in 1 ..< N: + (b, scratch[i]) = bld.subborrow(a[i], M[i], b) + + # If it underflows here, it means that it was + # smaller than the modulus and we don't need `scratch` + (b, _) = bld.subborrow(mask, zero, b) + + for i in 0 ..< N: + r[i] = bld.select(b, a[i], scratch[i]) + +proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. + ## + ## To be used when the modulus does not use the full bitwidth of the storing words + ## (say using 255 bits for the modulus out of 256 available in words) + + let bld = asy.builder + let fieldTy = cm.getFieldType(field) + let scratch = bld.makeArray(fieldTy) + let M = cm.getModulus(field) + let N = M.len + + # Now substract the modulus, and test a < M with the last borrow + let zero_i1 = constInt(asy.i1_t, 0) + var b: ValueRef + (b, scratch[0]) = bld.subborrow(a[0], M[0], zero_i1) + for i in 1 ..< N: + (b, scratch[i]) = bld.subborrow(a[i], M[i], b) + + # If it underflows here a was smaller than the modulus, which is what we want + for i in 0 ..< N: + r[i] = bld.select(b, a[i], scratch[i]) + +proc field_add_gen_sat*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef = + ## Generate an optimized modular addition kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + + let procName = cm.genSymbol(block: + case field + of fp: opFpAdd + of fr: opFrAdd) + let fieldTy = cm.getFieldType(field) + let pFieldTy = pointer_t(fieldTy) + + let addModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) + let addModKernel = asy.module.addFunction(cstring procName, addModTy) + let blck = asy.ctx.appendBasicBlock(addModKernel, "addModSatBody") + asy.builder.positionAtEnd(blck) + + let bld = asy.builder + + let r = bld.asArray(addModKernel.getParam(0), fieldTy) + let a = bld.asArray(addModKernel.getParam(1), fieldTy) + let b = bld.asArray(addModKernel.getParam(2), fieldTy) + + let t = bld.makeArray(fieldTy) + let N = cm.getNumWords(field) + + var c: ValueRef + let zero = constInt(asy.i1_t, 0) + + (c, t[0]) = bld.addcarry(a[0], b[0], zero) + for i in 1 ..< N: + (c, t[i]) = bld.addcarry(a[i], b[i], c) + + if cm.getSpareBits(field) >= 1: + asy.finalSubNoOverflow(cm, field, t, t) + else: + asy.finalSubMayOverflow(cm, field, t, t, c) + + bld.store(r, t) + bld.retVoid() + + return (addModTy, addModKernel) diff --git a/constantine/math_compiler/ir.nim b/constantine/math_compiler/ir.nim index 1523fdab..23464a72 100644 --- a/constantine/math_compiler/ir.nim +++ b/constantine/math_compiler/ir.nim @@ -265,6 +265,12 @@ func getFieldType*(cm: CurveMetadata, field: Field): TypeRef {.inline.} = else: return cm.fr.fieldTy +func getWordType*(cm: CurveMetadata, 
field: Field): TypeRef {.inline.} = + if field == fp: + return cm.fp.wordTy + else: + return cm.fr.wordTy + func getNumWords*(cm: CurveMetadata, field: Field): int {.inline.} = case field of fp: diff --git a/constantine/platforms/abis/llvm_abi.nim b/constantine/platforms/abis/llvm_abi.nim index 7fcf342f..974e29c1 100644 --- a/constantine/platforms/abis/llvm_abi.nim +++ b/constantine/platforms/abis/llvm_abi.nim @@ -601,17 +601,17 @@ type ## An instruction builder represents a point within a basic block and is ## the exclusive means of building instructions using the C interface. - IntPredicate* {.size: sizeof(cint).} = enum - IntEQ = 32 ## equal - IntNE ## not equal - IntUGT ## unsigned greater than - IntUGE ## unsigned greater or equal - IntULT ## unsigned less than - IntULE ## unsigned less or equal - IntSGT ## signed greater than - IntSGE ## signed greater or equal - IntSLT ## signed less than - IntSLE ## signed less or equal + Predicate* {.size: sizeof(cint).} = enum + kEQ = 32 ## equal + kNE ## not equal + kUGT ## unsigned greater than + kUGE ## unsigned greater or equal + kULT ## unsigned less than + kULE ## unsigned less or equal + kSGT ## signed greater than + kSGE ## signed greater or equal + kSLT ## signed less than + kSLE ## signed less or equal InlineAsmDialect* {.size: sizeof(cint).} = enum InlineAsmDialectATT @@ -675,19 +675,27 @@ proc call2*( proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAdd".} proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWAdd".} + ## Addition No Signed Wrap, i.e. guaranteed to not overflow proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWAdd".} + ## Addition No Unsigned Wrap, i.e. guaranteed to not overflow proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSub".} proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWSub".} + ## Substraction No Signed Wrap, i.e. guaranteed to not overflow proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWSub".} + ## Substraction No Unsigned Wrap, i.e. guaranteed to not overflow proc neg*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNeg".} proc negNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWNeg".} + ## Negation No Signed Wrap, i.e. guaranteed to not overflow proc negNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWNeg".} + ## Negation No Unsigned Wrap, i.e. guaranteed to not overflow proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMul".} proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWMul".} + ## Multiplication No Signed Wrap, i.e. guaranteed to not overflow proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWMul".} + ## Multiplication No Unsigned Wrap, i.e. 
guaranteed to not overflow proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildUDiv".} proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildExactUDiv".} @@ -706,7 +714,7 @@ proc `xor`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueR proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNot".} proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSelect".} -proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".} +proc icmp*(builder: BuilderRef, op: Predicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".} proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildBitcast".} proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildTrunc".} diff --git a/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim b/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim index 345acd8f..f0e438a2 100644 --- a/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim +++ b/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim @@ -85,7 +85,7 @@ func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= {.emit:["*",lo, " = (NU64)", dblPrec,";"].} func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = - ## Extended precision multiplication + ## Signed extended precision multiplication ## (hi, lo) <- a*b ## ## Inputs are intentionally unsigned @@ -103,4 +103,4 @@ func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = {.emit:[lo, " = (NU64)", dblPrec,";"].} else: {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} - {.emit:["*",lo, " = (NU64)", dblPrec,";"].} \ No newline at end of file + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} diff --git a/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim b/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim index 3216b859..06bde04d 100644 --- a/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim +++ b/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim @@ -77,12 +77,12 @@ func smul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_mul12 ## as we use their unchecked raw representation for cryptography func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = - ## Extended precision multiplication + ## Signed extended precision multiplication ## (hi, lo) <- a*b ## ## Inputs are intentionally unsigned ## as we use their unchecked raw representation for cryptography - ## + ## ## This is constant-time on most hardware ## See: https://www.bearssl.org/ctmul.html - lo = smul128(a, b, hi) \ No newline at end of file + lo = smul128(a, b, hi) diff --git a/constantine/platforms/llvm/nvidia_inlineasm.nim b/constantine/platforms/llvm/asm_nvidia.nim similarity index 100% rename from constantine/platforms/llvm/nvidia_inlineasm.nim rename to constantine/platforms/llvm/asm_nvidia.nim diff --git a/research/codegen/x86_inlineasm.nim b/constantine/platforms/llvm/asm_x86.nim similarity index 100% rename from research/codegen/x86_inlineasm.nim rename to constantine/platforms/llvm/asm_x86.nim diff --git a/constantine/platforms/llvm/llvm.nim 
b/constantine/platforms/llvm/llvm.nim index d222a306..38addee5 100644 --- a/constantine/platforms/llvm/llvm.nim +++ b/constantine/platforms/llvm/llvm.nim @@ -175,6 +175,12 @@ proc function_t*(returnType: TypeRef, paramTypes: openArray[TypeRef]): TypeRef { # Values # ------------------------------------------------------------ +# TODO: remove ConstValueRef +# - This is used in `selConstraint` in asm_nvidia +# to choose the `n` literal constraint. +# Instead of inlining as literal and hurting instruction decoding +# with large 8 bytes value, we load from const memory. + type ConstValueRef* = distinct ValueRef AnyValueRef* = ValueRef or ConstValueRef @@ -186,7 +192,34 @@ proc getName*(v: ValueRef): string = result = newString(rLen.int) copyMem(result[0].addr, rStr, rLen.int) -proc constInt*(ty: TypeRef, n: uint64, signExtend = false): ConstValueRef {.inline.} = +proc constInt*(ty: TypeRef, n: SomeInteger, signExtend = false): ConstValueRef {.inline.} = ConstValueRef constInt(ty, culonglong(n), LlvmBool(signExtend)) proc getTypeOf*(v: ConstValueRef): TypeRef {.borrow.} +proc zext*(builder: BuilderRef, val: ConstValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.inline.} = + ## Zero-extend + builder.zext(ValueRef val, destTy, name) +proc sext*(builder: BuilderRef, val: ConstValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.inline.} = + ## Sign-extend + builder.sext(ValueRef val, destTy, name) + +proc add*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + builder.add(ValueRef lhs, ValueRef rhs, name) +proc addNSW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Addition No Signed Wrap, i.e. guaranteed to not overflow + builder.addNSW(ValueRef lhs, ValueRef rhs, name) +proc addNUW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Addition No Unsigned Wrap, i.e. guaranteed to not overflow + builder.addNUW(ValueRef lhs, ValueRef rhs, name) + +proc sub*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + builder.sub(ValueRef lhs, ValueRef rhs, name) +proc subNSW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Substraction No Signed Wrap, i.e. guaranteed to not overflow + builder.subNSW(ValueRef lhs, ValueRef rhs, name) +proc subNUW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Substraction No Unsigned Wrap, i.e. guaranteed to not overflow + builder.subNUW(ValueRef lhs, ValueRef rhs, name) + +proc icmp*(builder: BuilderRef, op: Predicate, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + builder.icmp(op, ValueRef lhs, ValueRef rhs, name) diff --git a/constantine/platforms/llvm/super_instructions.nim b/constantine/platforms/llvm/super_instructions.nim new file mode 100644 index 00000000..f090785e --- /dev/null +++ b/constantine/platforms/llvm/super_instructions.nim @@ -0,0 +1,232 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. 
This file may not be copied, modified, or distributed except according to those terms. + +import ./llvm + +# ############################################################ +# +# LLVM IR super-instructions +# +# ############################################################ + +# This defines a collection of LLVM IR super-instructions +# Ideally those super-instructions compile-down +# to ISA optimized single instructions +# +# To ensure this, tests can be consulted at: +# https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/ + +# Add-carry: +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/add-of-carry.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/addcarry.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/addcarry2.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/adx-intrinsics.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/adx-intrinsics-upgrade.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/apx/adc.ll +# +# Sub-borrow +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/sub-with-overflow.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/AArch64/cgp-usubo.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/cgp-usubo.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/apx/sbb.ll +# +# Multiplication +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/mulx32.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/mulx64.ll + +# Warning: +# +# There is no guarantee of constant-time with LLVM IR +# It MAY introduce branches. 
+# For workload that involves private keys or secrets +# assembly MUST be used +# +# Alternatively an assembly source file must be generated +# and checked in the repo to avoid regressions should +# the compiler "progress" +# +# - https://github.com/mratsim/constantine/wiki/Constant-time-arithmetics#fighting-the-compiler +# - https://blog.cr.yp.to/20240803-clang.html +# - https://www.cl.cam.ac.uk/~rja14/Papers/whatyouc.pdf + +proc hi(bld: BuilderRef, val: ValueRef, baseTy: TypeRef, oversize: uint32, prefix: string): ValueRef = + let ctx = bld.getContext() + let bits = baseTy.getIntTypeWidth() + let overTy = ctx.int_t(bits + oversize) + + # %hi_shift_1 = zext i8 64 to i128 + let s = constInt(ctx.int8_t(), oversize) + let shift = bld.zext(s, overTy, name = cstring(prefix & "S_")) + # %hiLarge_1 = lshr i128 %input, %hi_shift_1 + let hiLarge = bld.lshr(val, shift, name = cstring(prefix & "L_")) + # %hi_1 = trunc i128 %hiLarge_1 to i64 + let hi = bld.trunc(hiLarge, baseTy, name = cstring(prefix & "_")) + + return hi + +proc addcarry*(bld: BuilderRef, a, b, carryIn: distinct AnyValueRef): tuple[carryOut, r: ValueRef] = + ## (cOut, result) <- a+b+cIn + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let add = bld.add(a, b, name = "adc01_") + let carry0 = bld.icmp(kULT, add, b, name = "adc01c_") + let cIn = bld.zext(carryIn, ty, name = "adc2_") + let adc = bld.add(cIn, add, name = "adc_") + let carry1 = bld.icmp(kULT, adc, add, name = "adc012c_") + let carryOut = bld.`or`(carry0, carry1, name = "cOut_") + + return (carryOut, adc) + +proc subborrow*(bld: BuilderRef, a, b, borrowIn: distinct AnyValueRef): tuple[borrowOut, r: ValueRef] = + ## (bOut, result) <- a-b-bIn + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let sub = bld.sub(a, b, name = "sbb01_") + let borrow0 = bld.icmp(kULT, a, b, name = "sbb01b_") + let bIn = bld.zext(borrowIn, ty, name = "sbb2_") + let sbb = bld.sub(sub, bIn, name = "sbb_") + let borrow1 = bld.icmp(kULT, sub, bIn, name = "sbb012b_") + let borrowOut = bld.`or`(borrow0, borrow1, name = "bOut_") + + return (borrowOut, sbb) + +proc mulExt*(bld: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + ## (hi, lo) <- a*b + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "mulx0_") + let b = bld.zext(b, dblTy, name = "mulx1_") + let r = bld.mulNUW(a, b, name = "mulx_") + + let lo = bld.trunc(r, ty, name = "mullo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "mulhi_") + return (hi, lo) + +proc smulExt*(bld: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = + ## Signed extended precision multiplication + ## (hi, lo) <- a*b + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.sext(a, dblTy, name = "smulx0_") + let b = bld.sext(b, dblTy, name = "smulx1_") + let r = bld.mulNSW(a, b, name = "smulx0_") + + let lo = bld.trunc(r, ty, name = "smullo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "smulhi_") + return (hi, lo) + +proc muladd1*(bld: BuilderRef, a, b, c: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding any c cannot overflow + let ctx = bld.getContext() + let ty = 
a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "fmax0_") + let b = bld.zext(b, dblTy, name = "fmax1_") + let ab = bld.mulNUW(a, b, name = "fmax01_") + + let c = bld.zext(c, dblTy, name = "fmax2_") + let r = bld.addNUW(ab, c, name = "fmax_") + + let lo = bld.trunc(r, ty, name = "fmalo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "fmahi_") + return (hi, lo) + +proc muladd2*(bld: BuilderRef, a, b, c1, c2: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000) + ## and we have enough space to add again 0xFFFFFFFF without overflowing + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "fmaa0_") + let b = bld.zext(b, dblTy, name = "fmaa1_") + let ab = bld.mulNUW(a, b, name = "fmaa01_") + + let c1 = bld.zext(c1, dblTy, name = "fmaa2_") + let abc1 = bld.addNUW(ab, c1, name = "fmaa012_") + let c2 = bld.zext(c2, dblTy, name = "fmaa3_") + let r = bld.addNUW(abc1, c2, name = "fmaa_") + + let lo = bld.trunc(r, ty, name = "fmaalo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "fmaahi_") + return (hi, lo) + +proc mulAcc*(bld: BuilderRef, tuv: var ValueRef, a, b: ValueRef) = + ## (t, u, v) <- (t, u, v) + a * b + let ctx = bld.getContext() + + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let x3ty = tuv.getTypeOf() + let x3bits = x3ty.getIntTypeWidth() + + doAssert bits * 3 == x3bits + + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "mac0_") + let b = bld.zext(b, dblTy, name = "mac1_") + let ab = bld.mulNUW(a, b, name = "mac01_") + + let wide_ab = bld.zext(ab, x3ty, name = "mac01x_") + let r = bld.addNUW(tuv, wide_ab, "mac_") + + tuv = r + +proc mulDoubleAcc*(bld: BuilderRef, tuv: var ValueRef, a, b: ValueRef) = + ## (t, u, v) <- (t, u, v) + 2 * a * b + let ctx = bld.getContext() + + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let x3ty = tuv.getTypeOf() + let x3bits = x3ty.getIntTypeWidth() + + doAssert bits * 3 == x3bits + + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "macd0_") + let b = bld.zext(b, dblTy, name = "macd1_") + let ab = bld.mulNUW(a, b, name = "macd01_") + + let wide_ab = bld.zext(ab, x3ty, name = "macd01x_") + let r1 = bld.addNUW(tuv, wide_ab, "macdpart_") + let r2 = bld.addNUW(r1, wide_ab, "macd_") + + tuv = r2 diff --git a/research/codegen/x86_instr.nim b/research/codegen/x86_instr.nim deleted file mode 100644 index a4a19219..00000000 --- a/research/codegen/x86_instr.nim +++ /dev/null @@ -1,96 +0,0 @@ -# Constantine -# Copyright (c) 2018-2019 Status Research & Development GmbH -# Copyright (c) 2020-Present Mamy André-Ratsimbazafy -# Licensed and distributed under either of -# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). -# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import - constantine/platforms/abis/c_abi, - constantine/platforms/llvm/llvm, - constantine/platforms/primitives, - constantine/math_compiler/ir, - ./x86_inlineasm - -export x86_inlineasm - -# ############################################################ -# -# x86 API -# -# ############################################################ - -proc defMulExt*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_mulExt64" - else: cstring"hw_mulExt32" - - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - - let mulExtTy = if wordSize == 64: function_t(doublePrec_t, [asy.i64_t, asy.i64_t]) - else: function_t(doublePrec_t, [asy.i32_t, asy.i32_t]) - let mulExtKernel = asy.module.addFunction(procName, mulExtTy) - let blck = asy.ctx.appendBasicBlock(mulExtKernel, "mulExtBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let a = bld.zext(mulExtKernel.getParam(0), doublePrec_t) - let b = bld.zext(mulExtKernel.getParam(1), doublePrec_t) - let r = bld.mul(a, b) - - bld.ret r - - return (mulExtTy, mulExtKernel) - -proc defHi*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_hi64" - else: cstring"hw_hi32" - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - let singlePrec_t = if wordSize == 64: asy.i64_t - else: asy.i32_t - - let hiTy = function_t(singlePrec_t, [doublePrec_t]) - - let hiKernel = asy.module.addFunction(procName, hiTy) - let blck = asy.ctx.appendBasicBlock(hiKernel, "hiBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - # %1 = zext i32 64 to i128 - let shift = bld.zext(constInt(asy.i32_t, culonglong wordSize, signExtend = LlvmBool(0)), doublePrec_t) - # %hiLarge = lshr i128 %input, %1 - let hiLarge = bld.lshr(hiKernel.getParam(0), shift) - # %hi = trunc i128 %hiLarge to i64 - let hi = bld.trunc(hiLarge, singlePrec_t) - - bld.ret hi - - return (hiTy, hiKernel) - -proc defLo*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_lo64" - else: cstring"hw_lo32" - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - let singlePrec_t = if wordSize == 64: asy.i64_t - else: asy.i32_t - - let loTy = function_t(singlePrec_t, [doublePrec_t]) - - let loKernel = asy.module.addFunction(procName, loTy) - let blck = asy.ctx.appendBasicBlock(loKernel, "loBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - # %lo = trunc i128 %input to i64 - let lo = bld.trunc(loKernel.getParam(0), singlePrec_t) - bld.ret lo - return (loTy, loKernel) diff --git a/research/codegen/x86_poc.nim b/research/codegen/x86_poc.nim index c5c376fe..a677158e 100644 --- a/research/codegen/x86_poc.nim +++ b/research/codegen/x86_poc.nim @@ -7,95 +7,33 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import + constantine/named/algebras, + constantine/math/io/io_bigints, + constantine/platforms/llvm/llvm, constantine/platforms/primitives, - constantine/math_compiler/ir, - ./x86_instr - -echo "LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX" - -proc big_mul_gen(asy: Assembler_LLVM): FnDef = - - - let procName = "big_mul_64x4" - let N = 4 - let ty = array_t(asy.i64_t, N) - let pty = pointer_t(ty) - - let bigMulTy = function_t(asy.void_t, [pty, pty, pty]) - let bigMulKernel = asy.module.addFunction(cstring procName, bigMulTy) - let blck = asy.ctx.appendBasicBlock(bigMulKernel, "bigMulBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let (hiTy, hiKernel) = asy.defHi(64) - proc hi(builder: BuilderRef, a: ValueRef): ValueRef = - return builder.call2( - hiTy, hiKernel, - [a], "hi64_" - ) - - let (loTy, loKernel) = asy.defLo(64) - proc lo(builder: BuilderRef, a: ValueRef): ValueRef = - return builder.call2( - loTy, loKernel, - [a], "lo64_" - ) - - let (mulExtTy, mulExtKernel) = asy.defMulExt(64) - bld.positionAtEnd(blck) - - proc mulx(builder: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = - # LLVM does not support multipel return value at the moment - # https://nondot.org/sabre/LLVMNotes/MultipleReturnValues.txt - # So we don't create an LLVM function - let t = builder.call2( - mulExtTy, mulExtKernel, - [a, b], "mulx64_" - ) - - builder.positionAtEnd(blck) - let lo = builder.lo(t) - let hi = builder.hi(t) - return (hi, lo) - - let r = bld.asArray(bigMulKernel.getParam(0), ty) - let a = bld.asArray(bigMulKernel.getParam(1), ty) - let b = bld.asArray(bigMulKernel.getParam(2), ty) - - let t = bld.makeArray(ty) - - block: # i = 0 - # TODO: properly implement add/adc in pure LLVM - - # TODO: ensure flags are cleared properly, compiler might optimize this away - t[0] = bld.`xor`(t[0], t[0]) - let (hi, lo) = bld.mulx(a[0], b[0]) - r[0] = lo - t[0] = hi - - for j in 1 ..< N: - let (hi , lo) = bld.mulx(a[j], b[0]) - t[j] = hi - # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target - discard bld.adcx_rr(t[j-1], lo) # Replace by LLVM IR uadd_with_overflow + constantine/math_compiler/[ir, impl_fields_sat] - # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target - discard bld.adcx_rr(t[N-1], 0) +proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Algebra, wordSize: WordSize): T = + CurveMetadata.init( + asy.ctx, + $curve & "_", wordSize, + fpBits = uint32 Fp[curve].bits(), + fpMod = Fp[curve].getModulus().toHex(), + frBits = uint32 Fr[curve].bits(), + frMod = Fr[curve].getModulus().toHex()) - # TODO: rotate t array +proc genFieldAddSat(asy: Assembler_LLVM, cm: CurveMetadata) = + let fpAdd = asy.field_add_gen_sat(cm, fp) + let frAdd = asy.field_add_gen_sat(cm, fr) - # TODO: impl i in 1 ..< N - bld.store(r, t) - bld.retVoid() - return (bigMulTy, bigMulKernel) - -when isMainModule: - # It's not the Nvidia PTX backend but it's fine +proc t_field_add(curve: static Algebra) = let asy = Assembler_LLVM.new(bkX86_64_Linux, cstring("x86_poc")) - let bigMul = asy.big_mul_gen() + let cm32 = CurveMetadata.init(asy, curve, size32) + asy.genFieldAddSat(cm32) + let cm64 = CurveMetadata.init(asy, curve, size64) + asy.genFieldAddSat(cm64) asy.module.verify(AbortProcessAction) @@ -105,26 +43,28 @@ when isMainModule: echo asy.module echo "=========================================" - var engine: ExecutionEngineRef initializeFullNativeTarget() - 
createJITCompilerForModule(engine, asy.module, optLevel = 0) + createJITCompilerForModule(engine, asy.module, optLevel = 3) - let jitMul = cast[proc(r: var array[4, uint64], a, b: array[4, uint64]){.noconv.}]( - engine.getFunctionAddress("big_mul_64x4") + let fn32 = cm32.genSymbol(opFpAdd) + let fn64 = cm64.genSymbol(opFpAdd) + + let jitFpAdd64 = cast[proc(r: var array[4, uint64], a, b: array[4, uint64]){.noconv.}]( + engine.getFunctionAddress(cstring fn64) ) var r: array[4, uint64] - r.jitMul([uint64 1, 2, 3, 4], [uint64 1, 1, 1, 1]) - echo "jitMul = ", r + r.jitFpAdd64([uint64 1, 2, 3, 4], [uint64 1, 1, 1, 1]) + echo "jitFpAdd64 = ", r # block: # Cleanup - Assembler_LLVM is auto-managed # engine.dispose() # also destroys the module attached to it, which double_frees Assembler_LLVM asy.module - echo "LLVM JIT - calling big_mul_64x4 SUCCESS" + echo "LLVM JIT - calling FpAdd64 SUCCESS" # -------------------------------------------- - # See the assembly- note it might be different from what the JIT compiler did + # See the assembly - note it might be different from what the JIT compiler did const triple = "x86_64-pc-linux-gnu" @@ -159,169 +99,4 @@ when isMainModule: echo machine.emitTo[:string](asy.module, AssemblyFile) echo "=========================================" - # Output - # ------------------------------------------------------------------ - - #[ - LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX - ========================================= - LLVM IR - - ; ModuleID = 'x86_poc' - source_filename = "x86_poc" - target triple = "x86_64-pc-linux-gnu" - - define void @big_mul_64x4(ptr %0, ptr %1, ptr %2) { - bigMulBody: - %3 = alloca [4 x i64], align 8 - %4 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %5 = load i64, ptr %4, align 4 - %6 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %7 = load i64, ptr %6, align 4 - %8 = xor i64 %5, %7 - %9 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - store i64 %8, ptr %9, align 4 - %10 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 0 - %11 = load i64, ptr %10, align 4 - %12 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %13 = load i64, ptr %12, align 4 - %mulx64_ = call i128 @hw_mulExt64(i64 %11, i64 %13) - %lo64_ = call i64 @hw_lo64(i128 %mulx64_) - %hi64_ = call i64 @hw_hi64(i128 %mulx64_) - %14 = getelementptr inbounds [4 x i64], ptr %0, i32 0, i32 0 - store i64 %lo64_, ptr %14, align 4 - %15 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - store i64 %hi64_, ptr %15, align 4 - %16 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 1 - %17 = load i64, ptr %16, align 4 - %18 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %19 = load i64, ptr %18, align 4 - %mulx64_1 = call i128 @hw_mulExt64(i64 %17, i64 %19) - %lo64_2 = call i64 @hw_lo64(i128 %mulx64_1) - %hi64_3 = call i64 @hw_hi64(i128 %mulx64_1) - %20 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1 - store i64 %hi64_3, ptr %20, align 4 - %21 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %22 = load i64, ptr %21, align 4 - %23 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %22, i64 %lo64_2) - %24 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 2 - %25 = load i64, ptr %24, align 4 - %26 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %27 = load i64, ptr %26, align 4 - %mulx64_4 = call i128 @hw_mulExt64(i64 %25, i64 %27) - %lo64_5 = call i64 @hw_lo64(i128 %mulx64_4) - %hi64_6 = call i64 @hw_hi64(i128 %mulx64_4) - %28 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2 - 
store i64 %hi64_6, ptr %28, align 4 - %29 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1 - %30 = load i64, ptr %29, align 4 - %31 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %30, i64 %lo64_5) - %32 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 3 - %33 = load i64, ptr %32, align 4 - %34 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %35 = load i64, ptr %34, align 4 - %mulx64_7 = call i128 @hw_mulExt64(i64 %33, i64 %35) - %lo64_8 = call i64 @hw_lo64(i128 %mulx64_7) - %hi64_9 = call i64 @hw_hi64(i128 %mulx64_7) - %36 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3 - store i64 %hi64_9, ptr %36, align 4 - %37 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2 - %38 = load i64, ptr %37, align 4 - %39 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %38, i64 %lo64_8) - %40 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3 - %41 = load i64, ptr %40, align 4 - %42 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %41, i64 0) - %43 = load [4 x i64], ptr %3, align 4 - store [4 x i64] %43, ptr %0, align 4 - ret void - } - - define i64 @hw_hi64(i128 %0) { - hiBody: - %1 = lshr i128 %0, 64 - %2 = trunc i128 %1 to i64 - ret i64 %2 - } - - define i64 @hw_lo64(i128 %0) { - loBody: - %1 = trunc i128 %0 to i64 - ret i64 %1 - } - - define i128 @hw_mulExt64(i64 %0, i64 %1) { - mulExtBody: - %2 = zext i64 %0 to i128 - %3 = zext i64 %1 to i128 - %4 = mul i128 %2, %3 - ret i128 %4 - } - - ========================================= - jitMul = [0, 0, 0, 0] - LLVM JIT - calling big_mul_64x4 SUCCESS - ========================================= - Assembly - - .text - .file "x86_poc" - .globl big_mul_64x4 - .p2align 4, 0x90 - .type big_mul_64x4,@function - big_mul_64x4: - .cfi_startproc - movq %rdx, %rcx - movq (%rdx), %rax - mulq (%rsi) - movq %rdx, %r8 - movq %rax, (%rdi) - movq (%rcx), %rcx - movq %rcx, %rax - mulq 8(%rsi) - movq %rdx, %r9 - movq %rcx, %rax - mulq 16(%rsi) - movq %rdx, %r10 - movq %rcx, %rax - mulq 24(%rsi) - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %rdx, 24(%rdi) - retq - .Lfunc_end0: - .size big_mul_64x4, .Lfunc_end0-big_mul_64x4 - .cfi_endproc - - .globl hw_hi64 - .p2align 4, 0x90 - .type hw_hi64,@function - hw_hi64: - movq %rsi, %rax - retq - .Lfunc_end1: - .size hw_hi64, .Lfunc_end1-hw_hi64 - - .globl hw_lo64 - .p2align 4, 0x90 - .type hw_lo64,@function - hw_lo64: - movq %rdi, %rax - retq - .Lfunc_end2: - .size hw_lo64, .Lfunc_end2-hw_lo64 - - .globl hw_mulExt64 - .p2align 4, 0x90 - .type hw_mulExt64,@function - hw_mulExt64: - movq %rsi, %rax - mulq %rdi - retq - .Lfunc_end3: - .size hw_mulExt64, .Lfunc_end3-hw_mulExt64 - - .section ".note.GNU-stack","",@progbits - - ========================================= - ]# +t_field_add(Secp256k1)
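
Illustrative note on the carry chain (a sketch, not part of the diff above): the `addcarry` super-instruction has no single LLVM C-API builder call, so it is composed from add/icmp/zext/or as in super_instructions.nim. Assuming a 64-bit word type, the emitted pattern is equivalent to the IR below; the value names and the function name are made up for readability and do not come from the generator. Per the CodeGen tests referenced in super_instructions.nim, the x86 backend is expected to match this pattern to an add/adc pair, and field_add_gen_sat simply chains N such steps, one per limb, before the final conditional subtraction.

  define { i1, i64 } @adc64_sketch(i64 %a, i64 %b, i1 %carry_in) {
    %sum0 = add i64 %a, %b
    %c0   = icmp ult i64 %sum0, %b          ; carry out of a + b
    %cin  = zext i1 %carry_in to i64
    %sum1 = add i64 %cin, %sum0
    %c1   = icmp ult i64 %sum1, %sum0       ; carry out of (a + b) + cin
    %cout = or i1 %c0, %c1                  ; at most one of the two carries can fire
    %res0 = insertvalue { i1, i64 } undef, i1 %cout, 0
    %res1 = insertvalue { i1, i64 } %res0, i64 %sum1, 1
    ret { i1, i64 } %res1
  }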