From 432a91e3a935915c91e2946dd4144ee1cf10cb07 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Mon, 5 Aug 2024 22:25:03 +0200 Subject: [PATCH] feat(LLVM): add codegenerator for saturated field add/sub --- PLANNING.md | 8 + .../assembly/limbs_asm_modular_x86.nim | 2 +- constantine/math_compiler/codegen_nvidia.nim | 4 +- .../math_compiler/impl_fields_nvidia.nim | 13 +- constantine/math_compiler/impl_fields_sat.nim | 166 ++++++++++ constantine/math_compiler/ir.nim | 6 + constantine/platforms/abis/llvm_abi.nim | 32 +- .../extended_precision_64bit_uint128.nim | 4 +- .../extended_precision_x86_64_msvc.nim | 6 +- .../{nvidia_inlineasm.nim => asm_nvidia.nim} | 0 .../platforms/llvm/asm_x86.nim | 0 constantine/platforms/llvm/llvm.nim | 35 ++- .../platforms/llvm/super_instructions.nim | 232 ++++++++++++++ research/codegen/x86_instr.nim | 96 ------ research/codegen/x86_poc.nim | 287 ++---------------- 15 files changed, 513 insertions(+), 378 deletions(-) create mode 100644 constantine/math_compiler/impl_fields_sat.nim rename constantine/platforms/llvm/{nvidia_inlineasm.nim => asm_nvidia.nim} (100%) rename research/codegen/x86_inlineasm.nim => constantine/platforms/llvm/asm_x86.nim (100%) create mode 100644 constantine/platforms/llvm/super_instructions.nim delete mode 100644 research/codegen/x86_instr.nim diff --git a/PLANNING.md b/PLANNING.md index 7fd6f956..b4258d26 100644 --- a/PLANNING.md +++ b/PLANNING.md @@ -101,6 +101,14 @@ Other tracks are stretch goals, contributions towards them are accepted. - introduce batchAffine_vartime - Optimized square_repeated in assembly for Montgomery and Crandall/Pseudo-Mersenne primes - Optimized elliptic curve directly calling assembly without ADX checks and limited input/output movement in registers or using function multi-versioning. +- LLVM IR: + - use internal or private linkage type + - look into calling conventions like "fast" or "Tail fast" + - check if returning a value from function is propely optimized + compared to in-place result + - use readnone (pure) and readmem attribute for functions + - look into passing parameter as arrays instead of pointers? 
+ - use hot function attribute ### User Experience track diff --git a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim index 12da5fcd..0db85007 100644 --- a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim +++ b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim @@ -80,7 +80,7 @@ proc finalSubMayOverflowImpl*( ctx.mov scratch[i], a[i] ctx.sbb scratch[i], M[i] - # If it overflows here, it means that it was + # If it underflows here, it means that it was # smaller than the modulus and we don't need `scratch` ctx.sbb scratchReg, 0 diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim index 19e92019..63245f95 100644 --- a/constantine/math_compiler/codegen_nvidia.nim +++ b/constantine/math_compiler/codegen_nvidia.nim @@ -9,12 +9,12 @@ import constantine/platforms/abis/nvidia_abi {.all.}, constantine/platforms/abis/c_abi, - constantine/platforms/llvm/[llvm, nvidia_inlineasm], + constantine/platforms/llvm/llvm, constantine/platforms/primitives, ./ir export - nvidia_abi, nvidia_inlineasm, + nvidia_abi, Flag, flag, wrapOpenArrayLenType # ############################################################ diff --git a/constantine/math_compiler/impl_fields_nvidia.nim b/constantine/math_compiler/impl_fields_nvidia.nim index 0ffbb5b1..6b034701 100644 --- a/constantine/math_compiler/impl_fields_nvidia.nim +++ b/constantine/math_compiler/impl_fields_nvidia.nim @@ -7,8 +7,8 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ../platforms/llvm/llvm, - ./ir, ./codegen_nvidia + constantine/platforms/llvm/[llvm, asm_nvidia], + ./ir # ############################################################ # @@ -40,8 +40,11 @@ import # but the carry codegen of madc.hi.cc.u64 has off-by-one # - https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 # - old 32-bit bug: https://forums.developer.nvidia.com/t/wrong-result-returned-by-madc-hi-u64-ptx-instruction-for-specific-operands/196094 +# +# See instruction throughput +# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions -proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubMayOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -74,7 +77,7 @@ proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, for i in 0 ..< N: r[i] = bld.slct(scratch[i], a[i], underflowedModulus) -proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubNoOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -354,4 +357,4 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub = false): FnDef = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) \ No newline at end of file + return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) diff --git a/constantine/math_compiler/impl_fields_sat.nim b/constantine/math_compiler/impl_fields_sat.nim new file mode 100644 index 00000000..4910548d --- /dev/null +++ 
b/constantine/math_compiler/impl_fields_sat.nim @@ -0,0 +1,166 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/llvm/[llvm, super_instructions], + ./ir, ./codegen_nvidia + +# ############################################################ +# +# Field arithmetic with saturated limbs +# +# ############################################################ +# +# This implements field operations in pure LLVM +# using saturated limbs, i.e. 64-bit words on 64-bit platforms. +# +# This relies on hardware addition-with-carry and substraction-with-borrow +# for efficiency. +# +# As such it is not suitable for platforms with no carry flags such as: +# - WASM +# - MIPS +# - RISC-V +# - Metal +# +# It may be suitable for Intel GPUs as the virtual ISA does support add-carry +# +# It is suitable for: +# - ARM +# - AMD GPUs (for prototyping) +# +# The following backends have better optimizations through assembly: +# - x86: access to ADOX and ADCX interleaved double-carry chain +# - Nvidia: access to multiply accumulate instruction +# and non-interleaved double-carry chain +# +# AMD GPUs may benefits from using 24-bit limbs +# - https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/programmer-references/AMD_OpenCL_Programming_Optimization_Guide2.pdf +# p2-23: +# Generally, the throughput and latency for 32-bit integer operations is the same +# as for single-precision floating point operations. +# 24-bit integer MULs and MADs have four times the throughput of 32-bit integer +# multiplies. 24-bit signed and unsigned integers are natively supported on the +# GCN family of devices. The use of OpenCL built-in functions for mul24 and mad24 +# is encouraged. Note that mul24 can be useful for array indexing operations +# Doc from 2015, it might not apply to RDNA family +# - https://free.eol.cn/edu_net/edudown/AMDppt/OpenCL%20Programming%20and%20Optimization%20-%20Part%20I.pdf +# slide 24 +# +# - https://chipsandcheese.com/2023/01/07/microbenchmarking-amds-rdna-3-graphics-architecture/ +# "Since Turing, Nvidia also achieves very good integer multiplication performance. +# Integer multiplication appears to be extremely rare in shader code, +# and AMD doesn’t seem to have optimized for it. +# 32-bit integer multiplication executes at around a quarter of FP32 rate, +# and latency is pretty high too." + +proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array, carry: ValueRef) = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. 
+ ## + ## To be used when the final substraction can + ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) + + let bld = asy.builder + let fieldTy = cm.getFieldType(field) + let wordTy = cm.getWordType(field) + let scratch = bld.makeArray(fieldTy) + let M = cm.getModulus(field) + let N = M.len + + let zero_i1 = constInt(asy.i1_t, 0) + let zero = constInt(wordTy, 0) + + # Mask: contains 0xFFFF or 0x0000 + let (_, mask) = bld.subborrow(zero, zero, carry) + + # Now substract the modulus, and test a < M + # (underflow) with the last borrow + var b: ValueRef + (b, scratch[0]) = bld.subborrow(a[0], M[0], zero_i1) + for i in 1 ..< N: + (b, scratch[i]) = bld.subborrow(a[i], M[i], b) + + # If it underflows here, it means that it was + # smaller than the modulus and we don't need `scratch` + (b, _) = bld.subborrow(mask, zero, b) + + for i in 0 ..< N: + r[i] = bld.select(b, a[i], scratch[i]) + +proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. + ## + ## To be used when the modulus does not use the full bitwidth of the storing words + ## (say using 255 bits for the modulus out of 256 available in words) + + let bld = asy.builder + let fieldTy = cm.getFieldType(field) + let scratch = bld.makeArray(fieldTy) + let M = cm.getModulus(field) + let N = M.len + + # Now substract the modulus, and test a < M with the last borrow + let zero_i1 = constInt(asy.i1_t, 0) + var b: ValueRef + (b, scratch[0]) = bld.subborrow(a[0], M[0], zero_i1) + for i in 1 ..< N: + (b, scratch[i]) = bld.subborrow(a[i], M[i], b) + + # If it underflows here a was smaller than the modulus, which is what we want + for i in 0 ..< N: + r[i] = bld.select(b, a[i], scratch[i]) + +proc field_add_gen_sat*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef = + ## Generate an optimized modular addition kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + + let procName = cm.genSymbol(block: + case field + of fp: opFpAdd + of fr: opFrAdd) + let fieldTy = cm.getFieldType(field) + let pFieldTy = pointer_t(fieldTy) + + let addModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) + let addModKernel = asy.module.addFunction(cstring procName, addModTy) + let blck = asy.ctx.appendBasicBlock(addModKernel, "addModSatBody") + asy.builder.positionAtEnd(blck) + + let bld = asy.builder + + let r = bld.asArray(addModKernel.getParam(0), fieldTy) + let a = bld.asArray(addModKernel.getParam(1), fieldTy) + let b = bld.asArray(addModKernel.getParam(2), fieldTy) + + let t = bld.makeArray(fieldTy) + let N = cm.getNumWords(field) + + var c: ValueRef + let zero = constInt(asy.i1_t, 0) + + (c, t[0]) = bld.addcarry(a[0], b[0], zero) + for i in 1 ..< N: + (c, t[i]) = bld.addcarry(a[i], b[i], c) + + if cm.getSpareBits(field) >= 1: + asy.finalSubNoOverflow(cm, field, t, t) + else: + asy.finalSubMayOverflow(cm, field, t, t, c) + + bld.store(r, t) + bld.retVoid() + + return (addModTy, addModKernel) diff --git a/constantine/math_compiler/ir.nim b/constantine/math_compiler/ir.nim index 1523fdab..23464a72 100644 --- a/constantine/math_compiler/ir.nim +++ b/constantine/math_compiler/ir.nim @@ -265,6 +265,12 @@ func getFieldType*(cm: CurveMetadata, field: Field): TypeRef {.inline.} = else: return cm.fr.fieldTy +func getWordType*(cm: CurveMetadata, 
field: Field): TypeRef {.inline.} = + if field == fp: + return cm.fp.wordTy + else: + return cm.fr.wordTy + func getNumWords*(cm: CurveMetadata, field: Field): int {.inline.} = case field of fp: diff --git a/constantine/platforms/abis/llvm_abi.nim b/constantine/platforms/abis/llvm_abi.nim index 7fcf342f..974e29c1 100644 --- a/constantine/platforms/abis/llvm_abi.nim +++ b/constantine/platforms/abis/llvm_abi.nim @@ -601,17 +601,17 @@ type ## An instruction builder represents a point within a basic block and is ## the exclusive means of building instructions using the C interface. - IntPredicate* {.size: sizeof(cint).} = enum - IntEQ = 32 ## equal - IntNE ## not equal - IntUGT ## unsigned greater than - IntUGE ## unsigned greater or equal - IntULT ## unsigned less than - IntULE ## unsigned less or equal - IntSGT ## signed greater than - IntSGE ## signed greater or equal - IntSLT ## signed less than - IntSLE ## signed less or equal + Predicate* {.size: sizeof(cint).} = enum + kEQ = 32 ## equal + kNE ## not equal + kUGT ## unsigned greater than + kUGE ## unsigned greater or equal + kULT ## unsigned less than + kULE ## unsigned less or equal + kSGT ## signed greater than + kSGE ## signed greater or equal + kSLT ## signed less than + kSLE ## signed less or equal InlineAsmDialect* {.size: sizeof(cint).} = enum InlineAsmDialectATT @@ -675,19 +675,27 @@ proc call2*( proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAdd".} proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWAdd".} + ## Addition No Signed Wrap, i.e. guaranteed to not overflow proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWAdd".} + ## Addition No Unsigned Wrap, i.e. guaranteed to not overflow proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSub".} proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWSub".} + ## Substraction No Signed Wrap, i.e. guaranteed to not overflow proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWSub".} + ## Substraction No Unsigned Wrap, i.e. guaranteed to not overflow proc neg*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNeg".} proc negNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWNeg".} + ## Negation No Signed Wrap, i.e. guaranteed to not overflow proc negNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWNeg".} + ## Negation No Unsigned Wrap, i.e. guaranteed to not overflow proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMul".} proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWMul".} + ## Multiplication No Signed Wrap, i.e. guaranteed to not overflow proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWMul".} + ## Multiplication No Unsigned Wrap, i.e. 
guaranteed to not overflow proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildUDiv".} proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildExactUDiv".} @@ -706,7 +714,7 @@ proc `xor`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueR proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNot".} proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSelect".} -proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".} +proc icmp*(builder: BuilderRef, op: Predicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".} proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildBitcast".} proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildTrunc".} diff --git a/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim b/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim index 345acd8f..f0e438a2 100644 --- a/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim +++ b/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim @@ -85,7 +85,7 @@ func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= {.emit:["*",lo, " = (NU64)", dblPrec,";"].} func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = - ## Extended precision multiplication + ## Signed extended precision multiplication ## (hi, lo) <- a*b ## ## Inputs are intentionally unsigned @@ -103,4 +103,4 @@ func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = {.emit:[lo, " = (NU64)", dblPrec,";"].} else: {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} - {.emit:["*",lo, " = (NU64)", dblPrec,";"].} \ No newline at end of file + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} diff --git a/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim b/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim index 3216b859..06bde04d 100644 --- a/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim +++ b/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim @@ -77,12 +77,12 @@ func smul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_mul12 ## as we use their unchecked raw representation for cryptography func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = - ## Extended precision multiplication + ## Signed extended precision multiplication ## (hi, lo) <- a*b ## ## Inputs are intentionally unsigned ## as we use their unchecked raw representation for cryptography - ## + ## ## This is constant-time on most hardware ## See: https://www.bearssl.org/ctmul.html - lo = smul128(a, b, hi) \ No newline at end of file + lo = smul128(a, b, hi) diff --git a/constantine/platforms/llvm/nvidia_inlineasm.nim b/constantine/platforms/llvm/asm_nvidia.nim similarity index 100% rename from constantine/platforms/llvm/nvidia_inlineasm.nim rename to constantine/platforms/llvm/asm_nvidia.nim diff --git a/research/codegen/x86_inlineasm.nim b/constantine/platforms/llvm/asm_x86.nim similarity index 100% rename from research/codegen/x86_inlineasm.nim rename to constantine/platforms/llvm/asm_x86.nim diff --git a/constantine/platforms/llvm/llvm.nim 
b/constantine/platforms/llvm/llvm.nim index d222a306..38addee5 100644 --- a/constantine/platforms/llvm/llvm.nim +++ b/constantine/platforms/llvm/llvm.nim @@ -175,6 +175,12 @@ proc function_t*(returnType: TypeRef, paramTypes: openArray[TypeRef]): TypeRef { # Values # ------------------------------------------------------------ +# TODO: remove ConstValueRef +# - This is used in `selConstraint` in asm_nvidia +# to choose the `n` literal constraint. +# Instead of inlining as literal and hurting instruction decoding +# with large 8 bytes value, we load from const memory. + type ConstValueRef* = distinct ValueRef AnyValueRef* = ValueRef or ConstValueRef @@ -186,7 +192,34 @@ proc getName*(v: ValueRef): string = result = newString(rLen.int) copyMem(result[0].addr, rStr, rLen.int) -proc constInt*(ty: TypeRef, n: uint64, signExtend = false): ConstValueRef {.inline.} = +proc constInt*(ty: TypeRef, n: SomeInteger, signExtend = false): ConstValueRef {.inline.} = ConstValueRef constInt(ty, culonglong(n), LlvmBool(signExtend)) proc getTypeOf*(v: ConstValueRef): TypeRef {.borrow.} +proc zext*(builder: BuilderRef, val: ConstValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.inline.} = + ## Zero-extend + builder.zext(ValueRef val, destTy, name) +proc sext*(builder: BuilderRef, val: ConstValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.inline.} = + ## Sign-extend + builder.sext(ValueRef val, destTy, name) + +proc add*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + builder.add(ValueRef lhs, ValueRef rhs, name) +proc addNSW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Addition No Signed Wrap, i.e. guaranteed to not overflow + builder.addNSW(ValueRef lhs, ValueRef rhs, name) +proc addNUW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Addition No Unsigned Wrap, i.e. guaranteed to not overflow + builder.addNUW(ValueRef lhs, ValueRef rhs, name) + +proc sub*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + builder.sub(ValueRef lhs, ValueRef rhs, name) +proc subNSW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Substraction No Signed Wrap, i.e. guaranteed to not overflow + builder.subNSW(ValueRef lhs, ValueRef rhs, name) +proc subNUW*(builder: BuilderRef, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + ## Substraction No Unsigned Wrap, i.e. guaranteed to not overflow + builder.subNUW(ValueRef lhs, ValueRef rhs, name) + +proc icmp*(builder: BuilderRef, op: Predicate, lhs, rhs: distinct AnyValueRef, name: cstring = ""): ValueRef {.inline.} = + builder.icmp(op, ValueRef lhs, ValueRef rhs, name) diff --git a/constantine/platforms/llvm/super_instructions.nim b/constantine/platforms/llvm/super_instructions.nim new file mode 100644 index 00000000..f090785e --- /dev/null +++ b/constantine/platforms/llvm/super_instructions.nim @@ -0,0 +1,232 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. 
This file may not be copied, modified, or distributed except according to those terms. + +import ./llvm + +# ############################################################ +# +# LLVM IR super-instructions +# +# ############################################################ + +# This defines a collection of LLVM IR super-instructions +# Ideally those super-instructions compile-down +# to ISA optimized single instructions +# +# To ensure this, tests can be consulted at: +# https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/ + +# Add-carry: +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/add-of-carry.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/addcarry.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/addcarry2.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/adx-intrinsics.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/adx-intrinsics-upgrade.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/apx/adc.ll +# +# Sub-borrow +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/sub-with-overflow.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/AArch64/cgp-usubo.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/cgp-usubo.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/apx/sbb.ll +# +# Multiplication +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/mulx32.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/mulx64.ll + +# Warning: +# +# There is no guarantee of constant-time with LLVM IR +# It MAY introduce branches. 
+# For workload that involves private keys or secrets +# assembly MUST be used +# +# Alternatively an assembly source file must be generated +# and checked in the repo to avoid regressions should +# the compiler "progress" +# +# - https://github.com/mratsim/constantine/wiki/Constant-time-arithmetics#fighting-the-compiler +# - https://blog.cr.yp.to/20240803-clang.html +# - https://www.cl.cam.ac.uk/~rja14/Papers/whatyouc.pdf + +proc hi(bld: BuilderRef, val: ValueRef, baseTy: TypeRef, oversize: uint32, prefix: string): ValueRef = + let ctx = bld.getContext() + let bits = baseTy.getIntTypeWidth() + let overTy = ctx.int_t(bits + oversize) + + # %hi_shift_1 = zext i8 64 to i128 + let s = constInt(ctx.int8_t(), oversize) + let shift = bld.zext(s, overTy, name = cstring(prefix & "S_")) + # %hiLarge_1 = lshr i128 %input, %hi_shift_1 + let hiLarge = bld.lshr(val, shift, name = cstring(prefix & "L_")) + # %hi_1 = trunc i128 %hiLarge_1 to i64 + let hi = bld.trunc(hiLarge, baseTy, name = cstring(prefix & "_")) + + return hi + +proc addcarry*(bld: BuilderRef, a, b, carryIn: distinct AnyValueRef): tuple[carryOut, r: ValueRef] = + ## (cOut, result) <- a+b+cIn + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let add = bld.add(a, b, name = "adc01_") + let carry0 = bld.icmp(kULT, add, b, name = "adc01c_") + let cIn = bld.zext(carryIn, ty, name = "adc2_") + let adc = bld.add(cIn, add, name = "adc_") + let carry1 = bld.icmp(kULT, adc, add, name = "adc012c_") + let carryOut = bld.`or`(carry0, carry1, name = "cOut_") + + return (carryOut, adc) + +proc subborrow*(bld: BuilderRef, a, b, borrowIn: distinct AnyValueRef): tuple[borrowOut, r: ValueRef] = + ## (bOut, result) <- a-b-bIn + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let sub = bld.sub(a, b, name = "sbb01_") + let borrow0 = bld.icmp(kULT, a, b, name = "sbb01b_") + let bIn = bld.zext(borrowIn, ty, name = "sbb2_") + let sbb = bld.sub(sub, bIn, name = "sbb_") + let borrow1 = bld.icmp(kULT, sub, bIn, name = "sbb012b_") + let borrowOut = bld.`or`(borrow0, borrow1, name = "bOut_") + + return (borrowOut, sbb) + +proc mulExt*(bld: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + ## (hi, lo) <- a*b + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "mulx0_") + let b = bld.zext(b, dblTy, name = "mulx1_") + let r = bld.mulNUW(a, b, name = "mulx_") + + let lo = bld.trunc(r, ty, name = "mullo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "mulhi_") + return (hi, lo) + +proc smulExt*(bld: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = + ## Signed extended precision multiplication + ## (hi, lo) <- a*b + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.sext(a, dblTy, name = "smulx0_") + let b = bld.sext(b, dblTy, name = "smulx1_") + let r = bld.mulNSW(a, b, name = "smulx0_") + + let lo = bld.trunc(r, ty, name = "smullo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "smulhi_") + return (hi, lo) + +proc muladd1*(bld: BuilderRef, a, b, c: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding any c cannot overflow + let ctx = bld.getContext() + let ty = 
a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "fmax0_") + let b = bld.zext(b, dblTy, name = "fmax1_") + let ab = bld.mulNUW(a, b, name = "fmax01_") + + let c = bld.zext(c, dblTy, name = "fmax2_") + let r = bld.addNUW(ab, c, name = "fmax_") + + let lo = bld.trunc(r, ty, name = "fmalo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "fmahi_") + return (hi, lo) + +proc muladd2*(bld: BuilderRef, a, b, c1, c2: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000) + ## and we have enough space to add again 0xFFFFFFFF without overflowing + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "fmaa0_") + let b = bld.zext(b, dblTy, name = "fmaa1_") + let ab = bld.mulNUW(a, b, name = "fmaa01_") + + let c1 = bld.zext(c1, dblTy, name = "fmaa2_") + let abc1 = bld.addNUW(ab, c1, name = "fmaa012_") + let c2 = bld.zext(c2, dblTy, name = "fmaa3_") + let r = bld.addNUW(abc1, c2, name = "fmaa_") + + let lo = bld.trunc(r, ty, name = "fmaalo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "fmaahi_") + return (hi, lo) + +proc mulAcc*(bld: BuilderRef, tuv: var ValueRef, a, b: ValueRef) = + ## (t, u, v) <- (t, u, v) + a * b + let ctx = bld.getContext() + + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let x3ty = tuv.getTypeOf() + let x3bits = x3ty.getIntTypeWidth() + + doAssert bits * 3 == x3bits + + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "mac0_") + let b = bld.zext(b, dblTy, name = "mac1_") + let ab = bld.mulNUW(a, b, name = "mac01_") + + let wide_ab = bld.zext(ab, x3ty, name = "mac01x_") + let r = bld.addNUW(tuv, wide_ab, "mac_") + + tuv = r + +proc mulDoubleAcc*(bld: BuilderRef, tuv: var ValueRef, a, b: ValueRef) = + ## (t, u, v) <- (t, u, v) + 2 * a * b + let ctx = bld.getContext() + + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let x3ty = tuv.getTypeOf() + let x3bits = x3ty.getIntTypeWidth() + + doAssert bits * 3 == x3bits + + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "macd0_") + let b = bld.zext(b, dblTy, name = "macd1_") + let ab = bld.mulNUW(a, b, name = "macd01_") + + let wide_ab = bld.zext(ab, x3ty, name = "macd01x_") + let r1 = bld.addNUW(tuv, wide_ab, "macdpart_") + let r2 = bld.addNUW(r1, wide_ab, "macd_") + + tuv = r2 diff --git a/research/codegen/x86_instr.nim b/research/codegen/x86_instr.nim deleted file mode 100644 index a4a19219..00000000 --- a/research/codegen/x86_instr.nim +++ /dev/null @@ -1,96 +0,0 @@ -# Constantine -# Copyright (c) 2018-2019 Status Research & Development GmbH -# Copyright (c) 2020-Present Mamy André-Ratsimbazafy -# Licensed and distributed under either of -# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). -# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import - constantine/platforms/abis/c_abi, - constantine/platforms/llvm/llvm, - constantine/platforms/primitives, - constantine/math_compiler/ir, - ./x86_inlineasm - -export x86_inlineasm - -# ############################################################ -# -# x86 API -# -# ############################################################ - -proc defMulExt*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_mulExt64" - else: cstring"hw_mulExt32" - - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - - let mulExtTy = if wordSize == 64: function_t(doublePrec_t, [asy.i64_t, asy.i64_t]) - else: function_t(doublePrec_t, [asy.i32_t, asy.i32_t]) - let mulExtKernel = asy.module.addFunction(procName, mulExtTy) - let blck = asy.ctx.appendBasicBlock(mulExtKernel, "mulExtBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let a = bld.zext(mulExtKernel.getParam(0), doublePrec_t) - let b = bld.zext(mulExtKernel.getParam(1), doublePrec_t) - let r = bld.mul(a, b) - - bld.ret r - - return (mulExtTy, mulExtKernel) - -proc defHi*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_hi64" - else: cstring"hw_hi32" - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - let singlePrec_t = if wordSize == 64: asy.i64_t - else: asy.i32_t - - let hiTy = function_t(singlePrec_t, [doublePrec_t]) - - let hiKernel = asy.module.addFunction(procName, hiTy) - let blck = asy.ctx.appendBasicBlock(hiKernel, "hiBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - # %1 = zext i32 64 to i128 - let shift = bld.zext(constInt(asy.i32_t, culonglong wordSize, signExtend = LlvmBool(0)), doublePrec_t) - # %hiLarge = lshr i128 %input, %1 - let hiLarge = bld.lshr(hiKernel.getParam(0), shift) - # %hi = trunc i128 %hiLarge to i64 - let hi = bld.trunc(hiLarge, singlePrec_t) - - bld.ret hi - - return (hiTy, hiKernel) - -proc defLo*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_lo64" - else: cstring"hw_lo32" - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - let singlePrec_t = if wordSize == 64: asy.i64_t - else: asy.i32_t - - let loTy = function_t(singlePrec_t, [doublePrec_t]) - - let loKernel = asy.module.addFunction(procName, loTy) - let blck = asy.ctx.appendBasicBlock(loKernel, "loBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - # %lo = trunc i128 %input to i64 - let lo = bld.trunc(loKernel.getParam(0), singlePrec_t) - bld.ret lo - return (loTy, loKernel) diff --git a/research/codegen/x86_poc.nim b/research/codegen/x86_poc.nim index c5c376fe..a677158e 100644 --- a/research/codegen/x86_poc.nim +++ b/research/codegen/x86_poc.nim @@ -7,95 +7,33 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import + constantine/named/algebras, + constantine/math/io/io_bigints, + constantine/platforms/llvm/llvm, constantine/platforms/primitives, - constantine/math_compiler/ir, - ./x86_instr - -echo "LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX" - -proc big_mul_gen(asy: Assembler_LLVM): FnDef = - - - let procName = "big_mul_64x4" - let N = 4 - let ty = array_t(asy.i64_t, N) - let pty = pointer_t(ty) - - let bigMulTy = function_t(asy.void_t, [pty, pty, pty]) - let bigMulKernel = asy.module.addFunction(cstring procName, bigMulTy) - let blck = asy.ctx.appendBasicBlock(bigMulKernel, "bigMulBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let (hiTy, hiKernel) = asy.defHi(64) - proc hi(builder: BuilderRef, a: ValueRef): ValueRef = - return builder.call2( - hiTy, hiKernel, - [a], "hi64_" - ) - - let (loTy, loKernel) = asy.defLo(64) - proc lo(builder: BuilderRef, a: ValueRef): ValueRef = - return builder.call2( - loTy, loKernel, - [a], "lo64_" - ) - - let (mulExtTy, mulExtKernel) = asy.defMulExt(64) - bld.positionAtEnd(blck) - - proc mulx(builder: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = - # LLVM does not support multipel return value at the moment - # https://nondot.org/sabre/LLVMNotes/MultipleReturnValues.txt - # So we don't create an LLVM function - let t = builder.call2( - mulExtTy, mulExtKernel, - [a, b], "mulx64_" - ) - - builder.positionAtEnd(blck) - let lo = builder.lo(t) - let hi = builder.hi(t) - return (hi, lo) - - let r = bld.asArray(bigMulKernel.getParam(0), ty) - let a = bld.asArray(bigMulKernel.getParam(1), ty) - let b = bld.asArray(bigMulKernel.getParam(2), ty) - - let t = bld.makeArray(ty) - - block: # i = 0 - # TODO: properly implement add/adc in pure LLVM - - # TODO: ensure flags are cleared properly, compiler might optimize this away - t[0] = bld.`xor`(t[0], t[0]) - let (hi, lo) = bld.mulx(a[0], b[0]) - r[0] = lo - t[0] = hi - - for j in 1 ..< N: - let (hi , lo) = bld.mulx(a[j], b[0]) - t[j] = hi - # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target - discard bld.adcx_rr(t[j-1], lo) # Replace by LLVM IR uadd_with_overflow + constantine/math_compiler/[ir, impl_fields_sat] - # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target - discard bld.adcx_rr(t[N-1], 0) +proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Algebra, wordSize: WordSize): T = + CurveMetadata.init( + asy.ctx, + $curve & "_", wordSize, + fpBits = uint32 Fp[curve].bits(), + fpMod = Fp[curve].getModulus().toHex(), + frBits = uint32 Fr[curve].bits(), + frMod = Fr[curve].getModulus().toHex()) - # TODO: rotate t array +proc genFieldAddSat(asy: Assembler_LLVM, cm: CurveMetadata) = + let fpAdd = asy.field_add_gen_sat(cm, fp) + let frAdd = asy.field_add_gen_sat(cm, fr) - # TODO: impl i in 1 ..< N - bld.store(r, t) - bld.retVoid() - return (bigMulTy, bigMulKernel) - -when isMainModule: - # It's not the Nvidia PTX backend but it's fine +proc t_field_add(curve: static Algebra) = let asy = Assembler_LLVM.new(bkX86_64_Linux, cstring("x86_poc")) - let bigMul = asy.big_mul_gen() + let cm32 = CurveMetadata.init(asy, curve, size32) + asy.genFieldAddSat(cm32) + let cm64 = CurveMetadata.init(asy, curve, size64) + asy.genFieldAddSat(cm64) asy.module.verify(AbortProcessAction) @@ -105,26 +43,28 @@ when isMainModule: echo asy.module echo "=========================================" - var engine: ExecutionEngineRef initializeFullNativeTarget() - 
createJITCompilerForModule(engine, asy.module, optLevel = 0) + createJITCompilerForModule(engine, asy.module, optLevel = 3) - let jitMul = cast[proc(r: var array[4, uint64], a, b: array[4, uint64]){.noconv.}]( - engine.getFunctionAddress("big_mul_64x4") + let fn32 = cm32.genSymbol(opFpAdd) + let fn64 = cm64.genSymbol(opFpAdd) + + let jitFpAdd64 = cast[proc(r: var array[4, uint64], a, b: array[4, uint64]){.noconv.}]( + engine.getFunctionAddress(cstring fn64) ) var r: array[4, uint64] - r.jitMul([uint64 1, 2, 3, 4], [uint64 1, 1, 1, 1]) - echo "jitMul = ", r + r.jitFpAdd64([uint64 1, 2, 3, 4], [uint64 1, 1, 1, 1]) + echo "jitFpAdd64 = ", r # block: # Cleanup - Assembler_LLVM is auto-managed # engine.dispose() # also destroys the module attached to it, which double_frees Assembler_LLVM asy.module - echo "LLVM JIT - calling big_mul_64x4 SUCCESS" + echo "LLVM JIT - calling FpAdd64 SUCCESS" # -------------------------------------------- - # See the assembly- note it might be different from what the JIT compiler did + # See the assembly - note it might be different from what the JIT compiler did const triple = "x86_64-pc-linux-gnu" @@ -159,169 +99,4 @@ when isMainModule: echo machine.emitTo[:string](asy.module, AssemblyFile) echo "=========================================" - # Output - # ------------------------------------------------------------------ - - #[ - LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX - ========================================= - LLVM IR - - ; ModuleID = 'x86_poc' - source_filename = "x86_poc" - target triple = "x86_64-pc-linux-gnu" - - define void @big_mul_64x4(ptr %0, ptr %1, ptr %2) { - bigMulBody: - %3 = alloca [4 x i64], align 8 - %4 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %5 = load i64, ptr %4, align 4 - %6 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %7 = load i64, ptr %6, align 4 - %8 = xor i64 %5, %7 - %9 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - store i64 %8, ptr %9, align 4 - %10 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 0 - %11 = load i64, ptr %10, align 4 - %12 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %13 = load i64, ptr %12, align 4 - %mulx64_ = call i128 @hw_mulExt64(i64 %11, i64 %13) - %lo64_ = call i64 @hw_lo64(i128 %mulx64_) - %hi64_ = call i64 @hw_hi64(i128 %mulx64_) - %14 = getelementptr inbounds [4 x i64], ptr %0, i32 0, i32 0 - store i64 %lo64_, ptr %14, align 4 - %15 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - store i64 %hi64_, ptr %15, align 4 - %16 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 1 - %17 = load i64, ptr %16, align 4 - %18 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %19 = load i64, ptr %18, align 4 - %mulx64_1 = call i128 @hw_mulExt64(i64 %17, i64 %19) - %lo64_2 = call i64 @hw_lo64(i128 %mulx64_1) - %hi64_3 = call i64 @hw_hi64(i128 %mulx64_1) - %20 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1 - store i64 %hi64_3, ptr %20, align 4 - %21 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %22 = load i64, ptr %21, align 4 - %23 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %22, i64 %lo64_2) - %24 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 2 - %25 = load i64, ptr %24, align 4 - %26 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %27 = load i64, ptr %26, align 4 - %mulx64_4 = call i128 @hw_mulExt64(i64 %25, i64 %27) - %lo64_5 = call i64 @hw_lo64(i128 %mulx64_4) - %hi64_6 = call i64 @hw_hi64(i128 %mulx64_4) - %28 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2 - 
store i64 %hi64_6, ptr %28, align 4 - %29 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1 - %30 = load i64, ptr %29, align 4 - %31 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %30, i64 %lo64_5) - %32 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 3 - %33 = load i64, ptr %32, align 4 - %34 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %35 = load i64, ptr %34, align 4 - %mulx64_7 = call i128 @hw_mulExt64(i64 %33, i64 %35) - %lo64_8 = call i64 @hw_lo64(i128 %mulx64_7) - %hi64_9 = call i64 @hw_hi64(i128 %mulx64_7) - %36 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3 - store i64 %hi64_9, ptr %36, align 4 - %37 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2 - %38 = load i64, ptr %37, align 4 - %39 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %38, i64 %lo64_8) - %40 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3 - %41 = load i64, ptr %40, align 4 - %42 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %41, i64 0) - %43 = load [4 x i64], ptr %3, align 4 - store [4 x i64] %43, ptr %0, align 4 - ret void - } - - define i64 @hw_hi64(i128 %0) { - hiBody: - %1 = lshr i128 %0, 64 - %2 = trunc i128 %1 to i64 - ret i64 %2 - } - - define i64 @hw_lo64(i128 %0) { - loBody: - %1 = trunc i128 %0 to i64 - ret i64 %1 - } - - define i128 @hw_mulExt64(i64 %0, i64 %1) { - mulExtBody: - %2 = zext i64 %0 to i128 - %3 = zext i64 %1 to i128 - %4 = mul i128 %2, %3 - ret i128 %4 - } - - ========================================= - jitMul = [0, 0, 0, 0] - LLVM JIT - calling big_mul_64x4 SUCCESS - ========================================= - Assembly - - .text - .file "x86_poc" - .globl big_mul_64x4 - .p2align 4, 0x90 - .type big_mul_64x4,@function - big_mul_64x4: - .cfi_startproc - movq %rdx, %rcx - movq (%rdx), %rax - mulq (%rsi) - movq %rdx, %r8 - movq %rax, (%rdi) - movq (%rcx), %rcx - movq %rcx, %rax - mulq 8(%rsi) - movq %rdx, %r9 - movq %rcx, %rax - mulq 16(%rsi) - movq %rdx, %r10 - movq %rcx, %rax - mulq 24(%rsi) - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %rdx, 24(%rdi) - retq - .Lfunc_end0: - .size big_mul_64x4, .Lfunc_end0-big_mul_64x4 - .cfi_endproc - - .globl hw_hi64 - .p2align 4, 0x90 - .type hw_hi64,@function - hw_hi64: - movq %rsi, %rax - retq - .Lfunc_end1: - .size hw_hi64, .Lfunc_end1-hw_hi64 - - .globl hw_lo64 - .p2align 4, 0x90 - .type hw_lo64,@function - hw_lo64: - movq %rdi, %rax - retq - .Lfunc_end2: - .size hw_lo64, .Lfunc_end2-hw_lo64 - - .globl hw_mulExt64 - .p2align 4, 0x90 - .type hw_mulExt64,@function - hw_mulExt64: - movq %rsi, %rax - mulq %rdi - retq - .Lfunc_end3: - .size hw_mulExt64, .Lfunc_end3-hw_mulExt64 - - .section ".note.GNU-stack","",@progbits - - ========================================= - ]# +t_field_add(Secp256k1)
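
Illustrative note on the carry chain (a sketch, not part of the diff above): the `addcarry` super-instruction has no single LLVM C-API builder call, so it is composed from add/icmp/zext/or as in super_instructions.nim. Assuming a 64-bit word type, the emitted pattern is equivalent to the IR below; the value names and the function name are made up for readability and do not come from the generator. Per the CodeGen tests referenced in super_instructions.nim, the x86 backend is expected to match this pattern to an add/adc pair, and field_add_gen_sat simply chains N such steps, one per limb, before the final conditional subtraction.

  define { i1, i64 } @adc64_sketch(i64 %a, i64 %b, i1 %carry_in) {
    %sum0 = add i64 %a, %b
    %c0   = icmp ult i64 %sum0, %b          ; carry out of a + b
    %cin  = zext i1 %carry_in to i64
    %sum1 = add i64 %cin, %sum0
    %c1   = icmp ult i64 %sum1, %sum0       ; carry out of (a + b) + cin
    %cout = or i1 %c0, %c1                  ; at most one of the two carries can fire
    %res0 = insertvalue { i1, i64 } undef, i1 %cout, 0
    %res1 = insertvalue { i1, i64 } %res0, i64 %sum1, 1
    ret { i1, i64 } %res1
  }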