Merge pull request #1475 from vext01/side-trace-fix
Fix a side-tracing + hardware tracing bug.
ltratt authored Nov 29, 2024
2 parents 02d7be9 + 844d090 commit a25b17c
Showing 4 changed files with 109 additions and 27 deletions.
30 changes: 24 additions & 6 deletions ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs
@@ -252,12 +252,30 @@ impl<'a> LSRegAlloc<'a> {

/// The parts of the register allocator needed for general purpose registers.
impl LSRegAlloc<'_> {
/// Forcibly assign the machine register `reg`, which must be in the [RegState::Empty] state,
/// to the value produced by instruction `iidx`.
pub(crate) fn force_assign_inst_gp_reg(&mut self, iidx: InstIdx, reg: Rq) {
debug_assert!(!self.gp_regset.is_set(reg));
self.gp_regset.set(reg);
self.gp_reg_states[usize::from(reg.code())] = RegState::FromInst(iidx);
/// Forcibly assign the machine register `reg` to the value produced by instruction `iidx`.
/// Note that if this register is already used, a spill will be generated instead.
pub(crate) fn force_assign_inst_gp_reg(&mut self, asm: &mut Assembler, iidx: InstIdx, reg: Rq) {
if self.gp_regset.is_set(reg) {
debug_assert_eq!(self.spills[usize::from(iidx)], SpillState::Empty);
// Input values alias to a single register. To avoid the rest of the register allocator
// having to think about this, we "dealias" the values by spilling.
let inst = self.m.inst_no_copies(iidx);
let size = inst.def_byte_size(self.m);
self.stack.align(size); // FIXME
let frame_off = self.stack.grow(size);
let off = i32::try_from(frame_off).unwrap();
match size {
1 => dynasm!(asm; mov BYTE [rbp - off], Rb(reg.code())),
2 => dynasm!(asm; mov WORD [rbp - off], Rw(reg.code())),
4 => dynasm!(asm; mov DWORD [rbp - off], Rd(reg.code())),
8 => dynasm!(asm; mov QWORD [rbp - off], Rq(reg.code())),
_ => unreachable!(),
}
self.spills[usize::from(iidx)] = SpillState::Stack(off);
} else {
self.gp_regset.set(reg);
self.gp_reg_states[usize::from(reg.code())] = RegState::FromInst(iidx);
}
}

/// Forcibly assign the floating point register `reg`, which must be in the [RegState::Empty] state,
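For context on the dealiasing in `force_assign_inst_gp_reg` above: a later use of the value can be rematerialised from the stack slot recorded in `self.spills`. The sketch below simply mirrors the spill arms in reverse; `reload_gp` is an invented name, not code from this commit.

// Illustrative sketch only (not from this commit): reload a value that
// `force_assign_inst_gp_reg` dealiased onto the stack.
fn reload_gp(asm: &mut Assembler, spill: &SpillState, size: usize, reg: Rq) {
    if let SpillState::Stack(off) = spill {
        let off = *off; // frame offset recorded when the value was spilled
        match size {
            1 => dynasm!(asm; mov Rb(reg.code()), BYTE [rbp - off]),
            2 => dynasm!(asm; mov Rw(reg.code()), WORD [rbp - off]),
            4 => dynasm!(asm; mov Rd(reg.code()), DWORD [rbp - off]),
            8 => dynasm!(asm; mov Rq(reg.code()), QWORD [rbp - off]),
            _ => unreachable!(),
        }
    }
}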
31 changes: 30 additions & 1 deletion ykrt/src/compile/jitc_yk/codegen/x64/mod.rs
@@ -1154,7 +1154,7 @@ impl<'a> Assemble<'a> {
debug_assert!(size <= REG64_BYTESIZE);
match m {
VarLocation::Register(reg_alloc::Register::GP(reg)) => {
self.ra.force_assign_inst_gp_reg(iidx, reg);
self.ra.force_assign_inst_gp_reg(&mut self.asm, iidx, reg);
}
VarLocation::Register(reg_alloc::Register::FP(reg)) => {
self.ra.force_assign_inst_fp_reg(iidx, reg);
@@ -4472,4 +4472,33 @@
false,
);
}

#[test]
fn cg_aliasing_loadtis() {
let mut m = jit_ir::Module::new(0, 0).unwrap();

// Create two trace inputs whose locations alias.
let loc = yksmp::Location::Register(13, 1, 0, [].into());
m.push_tiloc(loc);
let ti_inst = jit_ir::LoadTraceInputInst::new(0, m.int8_tyidx());
let op1 = m.push_and_make_operand(ti_inst.clone().into()).unwrap();
let op2 = m.push_and_make_operand(ti_inst.into()).unwrap();

let add_inst = jit_ir::BinOpInst::new(op1, jit_ir::BinOp::Add, op2);
m.push(add_inst.into()).unwrap();

let mt = MT::new().unwrap();
let hl = HotLocation {
kind: HotLocationKind::Tracing,
tracecompilation_errors: 0,
};

Assemble::new(&m, None, None)
.unwrap()
.codegen(mt, Arc::new(Mutex::new(hl)), None)
.unwrap()
.as_any()
.downcast::<X64CompiledTrace>()
.unwrap();
}
}
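Assuming DWARF register number 13 maps to r13 on x86-64, the fixed allocator should leave the test above in roughly this state (instruction indices and stack offset are illustrative):

// Both LoadTraceInputInsts name the same register, so after codegen:
//   gp_reg_states[13]         == RegState::FromInst(first_input_iidx)
//   spills[second_input_iidx] == SpillState::Stack(off)  // dealiased copy
// The test can be run in isolation with cargo's standard test filter:
//   cargo test cg_aliasing_loadtis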
62 changes: 43 additions & 19 deletions ykrt/src/compile/jitc_yk/trace_builder.rs
@@ -1202,29 +1202,53 @@ impl TraceBuilder {
})
.collect::<Vec<_>>();

#[cfg(tracer_swt)]
// When side-tracing a switch guard failure, we need to reprocess the switch statement
// (and only the switch statement) in order to emit a guard at the beginning of the
// side-trace checking that the case requiring execution is the case the trace
// captures.
//
// Note that it is not necessary to emit such a guard into side-traces stemming from
// regular conditionals, since a conditional has only two successors. The parent trace
// captures one, so by construction the side-trace must capture the other.
let prevbb = self.aot_mod.bblock(prev_bid.as_ref().unwrap());
if let aot_ir::Inst::Switch {
test_val,
default_dest,
case_values,
case_dests,
safepoint,
} = &prevbb.insts.last().unwrap()
{
// FIXME: This is a hack! When we side-trace the guard failure of a switch
// statement, the software tracer doesn't report the switch-statement block as the
// first block in the trace. This means we don't generate a guard at the top of the
// side-trace checking that the switch case is correct when executing the trace. We
// work around this by force-processing the previous block that's passed in via
// `SideTraceInfo` if its last instruction is a switch statement. Technically,
// this is the correct fix for hardware tracing too: we shouldn't re-process
// the switch-statement block, as it was already executed in the parent trace,
// which is problematic if the block has side-effects.
let prevbb = self.aot_mod.bblock(prev_bid.as_ref().unwrap());
if matches!(prevbb.insts.last(), Some(aot_ir::Inst::Switch { .. })) {
let nextbb = match &tas.first() {
Some(b) => self.lookup_aot_block(b),
_ => panic!(),
};
self.process_block(prev_bid.as_ref().unwrap(), &None, nextbb)?;
}
let nextbb = match &tas.first() {
Some(b) => self.lookup_aot_block(b),
_ => panic!(),
};
self.handle_switch(
prev_bid.as_ref().unwrap(), // this is safe, we've just created this above
prevbb.insts.len() - 1,
safepoint,
nextbb.as_ref().unwrap(),
test_val,
default_dest,
case_values,
case_dests,
)?;
}
}

// The variable `prev_bid` contains the block of the guard that initiated side-tracing (for
// normal traces this is set to `None`). When hardware tracing, we capture this block again
// as part of the side-trace. However, since we've already processed this block in the
// parent trace, we must not process it again in the side-trace.
//
// Typically, the mapper would strip this block for us, but for codegen-related reasons
// (e.g. a switch statement codegenning to many machine blocks), it's possible for
// multiple duplicates of this same block to show up here, all of which need to be skipped.
let mut trace_iter = tas.into_iter().peekable();
if prev_bid.is_some() {
while self.lookup_aot_block(trace_iter.peek().unwrap()) == prev_bid {
trace_iter.next().unwrap();
}
}

if sti.is_none() {
// Find the block containing the control point call. This is the (sole) predecessor of the
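Returning to the switch handling earlier in this file: schematically, the guard that `handle_switch` emits at the top of such a side-trace constrains the switch's test value so that the trace only remains valid when the same case is taken again. The model below uses invented stand-in types, not yk's IR:

// Illustrative model only. Returns (case value, expected equality) pairs
// that must all hold between the switch's test value and each case for the
// side-trace to remain valid.
enum SwitchArm {
    Case(u64),         // the trace followed an explicit case
    Default(Vec<u64>), // the trace followed the default arm
}

fn switch_entry_guards(taken: &SwitchArm) -> Vec<(u64, bool)> {
    match taken {
        // Guard `test_val == v`.
        SwitchArm::Case(v) => vec![(*v, true)],
        // Guard `test_val != v` for every explicit case value; together
        // these imply the default arm is taken again.
        SwitchArm::Default(vs) => vs.iter().map(|v| (*v, false)).collect(),
    }
}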
13 changes: 12 additions & 1 deletion ykrt/src/trace/hwt/mapper.rs
@@ -169,7 +169,18 @@ impl Iterator for HWTTraceIterator {
type Item = Result<TraceAction, AOTTraceIteratorError>;
fn next(&mut self) -> Option<Self::Item> {
if self.tas_generated == 0 {
// The first block contains the control point, which we need to remove.
// Remove the first block.
//
// If we are collecting a top-level trace, this removes the remainder of the block
// containing the control point.
//
// If we are side-tracing then this attempts to remove the block containing the failed
// guard, which is captured by the hardware tracer, but which we have already executed
// in the parent trace. Note, though, that some conditionals (e.g. switches) can span
// multiple machine blocks, which are not all removed here. Since we don't have enough
// information at this level to remove all of them, there's a workaround in the trace
// builder.
//
// As a rough proxy for "check that we removed only the thing we want to remove", we know
// that the control point will be contained in a single mappable block. The `unwrap` can
// only fail if our assumption about the block is incorrect (i.e. some part of the system
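A self-contained model of the first-block skip described in the comment above; all names here are invented for illustration:

// Illustrative model only: drop the first block a trace decoder yields,
// mirroring how HWTTraceIterator discards block zero.
struct SkipFirst<I: Iterator> {
    inner: I,
    tas_generated: usize,
}

impl<I: Iterator> Iterator for SkipFirst<I> {
    type Item = I::Item;
    fn next(&mut self) -> Option<I::Item> {
        if self.tas_generated == 0 {
            // Discard the control-point block (top-level traces) or the
            // already-executed failed-guard block (side-traces).
            self.inner.next()?;
        }
        self.tas_generated += 1;
        self.inner.next()
    }
}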
