Merge with upstream (#187)

wasmfx · May 29, 2024 · ae86820 · ae86820
2 parents ffcb499 + 10e6c46
commit ae86820
Show file tree

Hide file tree

Showing 26 changed files with 580 additions and 451 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -117,6 +117,7 @@ similar = { workspace = true }
 libtest-mimic = "0.7.0"
 capstone = { workspace = true }
 object = { workspace = true, features = ['std'] }
+wasmtime-test-macros = { path = "crates/test-macros" }
 
 [target.'cfg(windows)'.dev-dependencies]
 windows-sys = { workspace = true, features = ["Win32_System_Memory"] }

diff --git a/cranelift/codegen/src/ir/extfunc.rs b/cranelift/codegen/src/ir/extfunc.rs
@@ -250,12 +250,6 @@ pub enum ArgumentPurpose {
     /// This is a pointer to a context struct containing details about the current sandbox. It is
     /// used as a base pointer for `vmctx` global values.
     VMContext,
-
-    /// A stack limit pointer.
-    ///
-    /// This is a pointer to a stack limit. It is used to check the current stack pointer
-    /// against. Can only appear once in a signature.
-    StackLimit,
 }
 
 impl fmt::Display for ArgumentPurpose {
@@ -265,7 +259,6 @@ impl fmt::Display for ArgumentPurpose {
             Self::StructArgument(size) => return write!(f, "sarg({})", size),
             Self::StructReturn => "sret",
             Self::VMContext => "vmctx",
-            Self::StackLimit => "stack_limit",
         })
     }
 }
@@ -277,7 +270,6 @@ impl FromStr for ArgumentPurpose {
             "normal" => Ok(Self::Normal),
             "sret" => Ok(Self::StructReturn),
             "vmctx" => Ok(Self::VMContext),
-            "stack_limit" => Ok(Self::StackLimit),
             _ if s.starts_with("sarg(") => {
                 if !s.ends_with(")") {
                     return Err(());
@@ -374,7 +366,6 @@ mod tests {
             (ArgumentPurpose::Normal, "normal"),
             (ArgumentPurpose::StructReturn, "sret"),
             (ArgumentPurpose::VMContext, "vmctx"),
-            (ArgumentPurpose::StackLimit, "stack_limit"),
             (ArgumentPurpose::StructArgument(42), "sarg(42)"),
         ];
         for &(e, n) in &all_purpose {

diff --git a/cranelift/codegen/src/ir/types.rs b/cranelift/codegen/src/ir/types.rs
@@ -158,8 +158,6 @@ impl Type {
 
     /// Get a type with the same number of lanes as this type, but with the lanes replaced by
     /// integers of the same size.
-    ///
-    /// Scalar types follow this same rule, but `b1` is converted into `i8`
     pub fn as_int(self) -> Self {
         self.replace_lanes(match self.lane_type() {
             I8 => I8,

diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -166,9 +166,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
 
             if matches!(
                 param.purpose,
-                ir::ArgumentPurpose::StructArgument(_)
-                    | ir::ArgumentPurpose::StructReturn
-                    | ir::ArgumentPurpose::StackLimit
+                ir::ArgumentPurpose::StructArgument(_) | ir::ArgumentPurpose::StructReturn
             ) {
                 assert!(
                     call_conv != isa::CallConv::Tail,

diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -2815,6 +2815,10 @@
 ;; Generates a bitcast instruction.
 ;; Args are: src, src_ty, dst_ty
 (decl gen_bitcast (Reg Type Type) Reg)
+(rule 5 (gen_bitcast r (ty_scalar_float src_ty) (ty_vec_fits_in_register _)) (rv_vfmv_sf r src_ty))
+(rule 4 (gen_bitcast r (ty_int_ref_scalar_64 src_ty) (ty_vec_fits_in_register _)) (rv_vmv_sx r src_ty))
+(rule 3 (gen_bitcast r (ty_vec_fits_in_register _) (ty_scalar_float dst_ty)) (rv_vfmv_fs r dst_ty))
+(rule 2 (gen_bitcast r (ty_vec_fits_in_register _) (ty_int_ref_scalar_64 dst_ty)) (rv_vmv_xs r dst_ty))
 (rule 1 (gen_bitcast r $F32 $I32) (rv_fmvxw r))
 (rule 1 (gen_bitcast r $F64 $I64) (rv_fmvxd r))
 (rule 1 (gen_bitcast r $I32 $F32) (rv_fmvwx r))

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -268,13 +268,16 @@ impl VecAluOpRRRR {
             VecAluOpRRRR::VfnmaccVV | VecAluOpRRRR::VfnmaccVF => 0b101101,
             VecAluOpRRRR::VfmsacVV | VecAluOpRRRR::VfmsacVF => 0b101110,
             VecAluOpRRRR::VfnmsacVV | VecAluOpRRRR::VfnmsacVF => 0b101111,
+            VecAluOpRRRR::Vslide1upVX => 0b001110,
         }
     }
 
     pub fn category(&self) -> VecOpCategory {
         match self {
             VecAluOpRRRR::VmaccVV | VecAluOpRRRR::VnmsacVV => VecOpCategory::OPMVV,
-            VecAluOpRRRR::VmaccVX | VecAluOpRRRR::VnmsacVX => VecOpCategory::OPMVX,
+            VecAluOpRRRR::VmaccVX | VecAluOpRRRR::VnmsacVX | VecAluOpRRRR::Vslide1upVX => {
+                VecOpCategory::OPMVX
+            }
             VecAluOpRRRR::VfmaccVV
             | VecAluOpRRRR::VfnmaccVV
             | VecAluOpRRRR::VfmsacVV
@@ -299,7 +302,10 @@ impl VecAluOpRRRR {
 
 impl VecInstOverlapInfo for VecAluOpRRRR {
     fn forbids_src_dst_overlaps(&self) -> bool {
-        false
+        match self {
+            VecAluOpRRRR::Vslide1upVX => true,
+            _ => false,
+        }
     }
 }
 

diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -229,6 +229,7 @@
   (VfnmaccVF)
   (VfmsacVF)
   (VfnmsacVF)
+  (Vslide1upVX)
 ))
 
 ;; Register-Imm ALU Ops
@@ -1095,6 +1096,13 @@
 (rule (rv_vslideup_vvi vd vs2 imm mask vstate)
   (vec_alu_rrr_uimm5 (VecAluOpRRRImm5.VslideupVI) vd vs2 imm mask vstate))
 
+;; Helper for emitting the `vslide1up.vx` instruction.
+;;
+;; # vd[0]=x[rs1], vd[i+1] = vs2[i]
+(decl rv_vslide1up_vx (VReg VReg XReg VecOpMasking VState) VReg)
+(rule (rv_vslide1up_vx vd vs2 rs1 mask vstate)
+  (vec_alu_rrrr (VecAluOpRRRR.Vslide1upVX) vd vs2 rs1 mask vstate))
+
 ;; Helper for emitting the `vmv.x.s` instruction.
 ;; This instruction copies the first element of the source vector to the destination X register.
 ;; Masked versions of this instruction are not supported.

diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -2436,8 +2436,24 @@
       (elf_tls_get_addr name))
 
 ;;;;;  Rules for `bitcast`;;;;;;;;;
-(rule
-   (lower (has_type out_ty (bitcast _ v @ (value_type in_ty))))
+
+;; These rules should probably be handled in `gen_bitcast`, but it's convenient to have
+;; `gen_bitcast` return a single register instead of a `ValueRegs`.
+(rule 2 (lower (has_type $I128 (bitcast _ v @ (value_type (ty_vec_fits_in_register _)))))
+    (value_regs
+      (gen_extractlane $I64X2 v 0)
+      (gen_extractlane $I64X2 v 1)))
+
+;; Move the high half into a vector register, and then use vslide1up to move it up and
+;; insert the lower half in one instruction.
+(rule 1 (lower (has_type (ty_vec_fits_in_register _) (bitcast _ v @ (value_type $I128))))
+    (let ((lo XReg (value_regs_get v 0))
+          (hi XReg (value_regs_get v 1))
+          (vstate VState (vstate_from_type $I64X2))
+          (vec VReg (rv_vmv_sx hi vstate)))
+      (rv_vslide1up_vx vec vec lo (unmasked) vstate)))
+
+(rule 0 (lower (has_type out_ty (bitcast _ v @ (value_type in_ty))))
    (gen_bitcast v in_ty out_ty))
 
 ;;;;;  Rules for `ceil`;;;;;;;;;

diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs
@@ -1164,13 +1164,9 @@ impl<M: ABIMachineSpec> Callee<M> {
         // stack limit. It is specified as a global value (`f.stack_limit`),
         // which often calculates the stack limit from the arguments; the
         // special-purpose `stack_limit` argument is no longer supported.
-        let stack_limit =
-            get_special_purpose_param_register(f, sigs, sig, ir::ArgumentPurpose::StackLimit)
-                .map(|reg| (reg, smallvec![]))
-                .or_else(|| {
-                    f.stack_limit
-                        .map(|gv| gen_stack_limit::<M>(f, sigs, sig, gv))
-                });
+        let stack_limit = f
+            .stack_limit
+            .map(|gv| gen_stack_limit::<M>(f, sigs, sig, gv));
 
         let tail_args_size = sigs[sig].sized_stack_arg_space;
 

diff --git a/cranelift/codegen/src/write.rs b/cranelift/codegen/src/write.rs
@@ -218,7 +218,7 @@ fn write_arg(w: &mut dyn Write, func: &Function, arg: Value) -> fmt::Result {
 ///
 ///    block1:
 ///    block1(v1: i32):
-///    block10(v4: f64, v5: b1):
+///    block10(v4: f64, v5: i8):
 ///
 pub fn write_block_header(
     w: &mut dyn Write,

diff --git a/cranelift/docs/compare-llvm.md b/cranelift/docs/compare-llvm.md
@@ -155,9 +155,9 @@ can hold.
 - Cranelift has no aggregate types. LLVM has named and anonymous struct types as
   well as array types.
 
-Cranelift has multiple boolean types, whereas LLVM simply uses `i1`. The sized
-Cranelift boolean types are used to represent SIMD vector masks like `b32x4`
-where each lane is either all 0 or all 1 bits.
+Cranelift uses integer-typed values of `0` or `1` for booleans, whereas LLVM
+simply uses `i1`. The sized Cranelift integer types are used to represent SIMD
+vector masks like `i32x4` where each lane is either all 0 or all 1 bits.
 
 Cranelift instructions and function calls can return multiple result values. LLVM
 instead models this by returning a single value of an aggregate type.

diff --git a/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif b/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
@@ -15,19 +15,6 @@ block0:
 ; block0: ; offset 0x0
 ;   ret
 
-function %stack_limit_leaf_zero(i64 stack_limit) {
-block0(v0: i64):
-    return
-}
-
-; VCode:
-; block0:
-;   ret
-;
-; Disassembled:
-; block0: ; offset 0x0
-;   ret
-
 function %stack_limit_gv_leaf_zero(i64 vmctx) {
     gv0 = vmctx
     gv1 = load.i64 notrap aligned gv0
@@ -45,42 +32,6 @@ block0(v0: i64):
 ; block0: ; offset 0x0
 ;   ret
 
-function %stack_limit_call_zero(i64 stack_limit) {
-    fn0 = %foo()
-block0(v0: i64):
-    call fn0()
-    return
-}
-
-; VCode:
-;   stp fp, lr, [sp, #-16]!
-;   mov fp, sp
-;   add x16, x0, #16
-;   subs xzr, sp, x16, UXTX
-;   b.lo #trap=stk_ovf
-; block0:
-;   load_ext_name x0, TestCase(%foo)+0
-;   blr x0
-;   ldp fp, lr, [sp], #16
-;   ret
-;
-; Disassembled:
-; block0: ; offset 0x0
-;   stp x29, x30, [sp, #-0x10]!
-;   mov x29, sp
-;   add x16, x0, #0x10
-;   cmp sp, x16
-;   b.lo #0x30
-; block1: ; offset 0x14
-;   ldr x0, #0x1c
-;   b #0x24
-;   .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %foo 0
-;   .byte 0x00, 0x00, 0x00, 0x00
-;   blr x0
-;   ldp x29, x30, [sp], #0x10
-;   ret
-;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: stk_ovf
-
 function %stack_limit_gv_call_zero(i64 vmctx) {
     gv0 = vmctx
     gv1 = load.i64 notrap aligned gv0
@@ -125,87 +76,6 @@ block0(v0: i64):
 ;   ret
 ;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: stk_ovf
 
-function %stack_limit(i64 stack_limit) {
-    ss0 = explicit_slot 168
-block0(v0: i64):
-    return
-}
-
-; VCode:
-;   stp fp, lr, [sp, #-16]!
-;   mov fp, sp
-;   add x16, x0, #176
-;   subs xzr, sp, x16, UXTX
-;   b.lo #trap=stk_ovf
-;   sub sp, sp, #176
-; block0:
-;   add sp, sp, #176
-;   ldp fp, lr, [sp], #16
-;   ret
-;
-; Disassembled:
-; block0: ; offset 0x0
-;   stp x29, x30, [sp, #-0x10]!
-;   mov x29, sp
-;   add x16, x0, #0xb0
-;   cmp sp, x16
-;   b.lo #0x24
-;   sub sp, sp, #0xb0
-; block1: ; offset 0x18
-;   add sp, sp, #0xb0
-;   ldp x29, x30, [sp], #0x10
-;   ret
-;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: stk_ovf
-
-function %huge_stack_limit(i64 stack_limit) {
-    ss0 = explicit_slot 400000
-block0(v0: i64):
-    return
-}
-
-; VCode:
-;   stp fp, lr, [sp, #-16]!
-;   mov fp, sp
-;   subs xzr, sp, x0, UXTX
-;   b.lo #trap=stk_ovf
-;   movz w17, #6784
-;   movk w17, w17, #6, LSL #16
-;   add x16, x0, x17, UXTX
-;   subs xzr, sp, x16, UXTX
-;   b.lo #trap=stk_ovf
-;   movz w16, #6784
-;   movk w16, w16, #6, LSL #16
-;   sub sp, sp, x16, UXTX
-; block0:
-;   movz w16, #6784
-;   movk w16, w16, #6, LSL #16
-;   add sp, sp, x16, UXTX
-;   ldp fp, lr, [sp], #16
-;   ret
-;
-; Disassembled:
-; block0: ; offset 0x0
-;   stp x29, x30, [sp, #-0x10]!
-;   mov x29, sp
-;   cmp sp, x0
-;   b.lo #0x44
-;   mov w17, #0x1a80
-;   movk w17, #6, lsl #16
-;   add x16, x0, x17, uxtx
-;   cmp sp, x16
-;   b.lo #0x48
-;   mov w16, #0x1a80
-;   movk w16, #6, lsl #16
-;   sub sp, sp, x16
-; block1: ; offset 0x30
-;   mov w16, #0x1a80
-;   movk w16, #6, lsl #16
-;   add sp, sp, x16
-;   ldp x29, x30, [sp], #0x10
-;   ret
-;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: stk_ovf
-;   .byte 0x1f, 0xc1, 0x00, 0x00 ; trap: stk_ovf
-
 function %limit_preamble(i64 vmctx) {
     gv0 = vmctx
     gv1 = load.i64 notrap aligned gv0