From 4073a100751edf364fe8b5a85a573ce034dd6657 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Thu, 21 Nov 2024 18:37:35 +0800
Subject: [PATCH] cpu-o3: replay cache missed load from replayQ

This commit only changes the handling of normal loads. Uncacheable and AMO/LLSC loads keep the original flow.

Change-Id: Idc98ee18a6e94a39774ebba0f772820699b834de
---
 src/cpu/o3/dyn_inst.hh           | 13 ++++++
 src/cpu/o3/iew.cc                |  6 +++
 src/cpu/o3/iew.hh                |  3 ++
 src/cpu/o3/inst_queue.cc         | 49 +++++++++++++++++++-
 src/cpu/o3/inst_queue.hh         | 21 +++++++++
 src/cpu/o3/lsq.cc                | 79 ++++++++++++++++++++++----------
 src/mem/cache/base.cc            | 10 ++--
 src/mem/packet.hh                |  2 +
 src/mem/request.hh               |  1 +
 src/mem/ruby/system/RubyPort.cc  |  1 +
 src/mem/ruby/system/Sequencer.cc |  6 +--
 11 files changed, 160 insertions(+), 31 deletions(-)

diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 5bee40588d..d4d1d38162 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -194,6 +194,7 @@ class DynInst : public ExecContext, public RefCounted
         NotAnInst,
         TranslationStarted,
         TranslationCompleted,
+        CacheRefilledAfterMiss,
         PossibleLoadViolation,
         HitExternalSnoop,
         EffAddrValid,
@@ -462,6 +463,14 @@ class DynInst : public ExecContext, public RefCounted
     }
     void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; }
 
+    /** True once the Dcache has been refilled after this load missed. */
+    bool
+    cacheRefilledAfterMiss() const
+    {
+        return instFlags[CacheRefilledAfterMiss];
+    }
+    void cacheRefilledAfterMiss(bool f) { instFlags[CacheRefilledAfterMiss] = f; }
+
     /** True if this address was found to match a previous load and they issued
      * out of order. If that happend, then it's only a problem if an incoming
      * snoop invalidate modifies the line, in which case we need to squash.
@@ -1397,6 +1406,10 @@ class DynInst : public ExecContext, public RefCounted
         return squashVer.getVersion();
     }
 
+    ssize_t getLqIdx()
+    {
+        return lqIdx;
+    }
 
     Addr getPC()
     {
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 5557640075..a622021638 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -682,6 +682,12 @@ IEW::blockMemInst(const DynInstPtr& inst)
     instQueue.blockMemInst(inst);
 }
 
+void
+IEW::cacheMissLdReplay(const DynInstPtr& inst)
+{
+    instQueue.cacheMissLdReplay(inst);
+}
+
 void
 IEW::cacheUnblocked()
 {
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index f41dfb9492..472924a02b 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -209,6 +209,9 @@ class IEW
     /** Moves memory instruction onto the list of cache blocked instructions */
     void blockMemInst(const DynInstPtr &inst);
 
+    /** Moves a load into the set of cache-missed loads awaiting replay */
+    void cacheMissLdReplay(const DynInstPtr &inst);
+
     /** Notifies that the cache has become unblocked */
     void cacheUnblocked();
 
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index 96f3d2eff2..9b080c8f67 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -94,6 +94,12 @@ InstructionQueue::FUCompletion::description() const
     return "Functional unit completion";
 }
 
+size_t
+InstructionQueue::CacheMissLdInstsHash::operator()(const DynInstPtr& ptr) const
+{
+    return ptr->getLqIdx();
+}
+
 InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
         const BaseO3CPUParams &params)
     : cpu(cpu_ptr),
@@ -352,6 +358,7 @@ InstructionQueue::resetState()
 
     nonSpecInsts.clear();
     deferredMemInsts.clear();
+    cacheMissLdInsts.clear();
     blockedMemInsts.clear();
     retryMemInsts.clear();
     wbOutstanding = 0;
@@ -650,6 +657,10 @@ InstructionQueue::scheduleReadyInsts()
     IssueStruct *i2e_info = issueToExecuteQueue->access(0);
 
     DynInstPtr mem_inst;
+    while ((mem_inst = getCacheMissInstToExecute())) {
+        mem_inst->issueQue->retryMem(mem_inst);
+    }
+
     while ((mem_inst = getDeferredMemInstToExecute())) {
         mem_inst->issueQue->retryMem(mem_inst);
     }
@@ -720,7 +731,7 @@ InstructionQueue::scheduleReadyInsts()
     // @todo If the way deferred memory instructions are handeled due to
     // translation changes then the deferredMemInsts condition should be
     // removed from the code below.
-    if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty()) {
+    if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty() || !cacheMissLdInsts.empty()) {
         cpu->activityThisCycle();
     } else {
         DPRINTF(IQ, "Not able to schedule any instructions.\n");
@@ -859,6 +870,19 @@ InstructionQueue::deferMemInst(const DynInstPtr &deferred_inst)
     deferredMemInsts.push_back(deferred_inst);
 }
 
+void
+InstructionQueue::cacheMissLdReplay(const DynInstPtr &deferred_inst)
+{
+    DPRINTF(IQ, "Get Cache Missed Load, insert to Replay Queue "
+            "[sn:%llu]\n", deferred_inst->seqNum);
+    // Reset DTB translation state
+    deferred_inst->translationStarted(false);
+    deferred_inst->translationCompleted(false);
+
+    deferred_inst->clearCanIssue();
+    cacheMissLdInsts.insert(deferred_inst);
+}
+
 void
 InstructionQueue::blockMemInst(const DynInstPtr &blocked_inst)
 {
@@ -901,6 +925,29 @@ InstructionQueue::getDeferredMemInstToExecute()
     return nullptr;
 }
 
+DynInstPtr
+InstructionQueue::getCacheMissInstToExecute()
+{
+    for (auto it = cacheMissLdInsts.begin(); it != cacheMissLdInsts.end();
+         ++it) {
+        if ((*it)->cacheRefilledAfterMiss() || (*it)->isSquashed()) {
+            DPRINTF(IQ, "CacheMissed load inst [sn:%llu] PC %s is ready to "
+                    "execute\n", (*it)->seqNum, (*it)->pcState());
+            DynInstPtr mem_inst = std::move(*it);
+            cacheMissLdInsts.erase(it);
+            return mem_inst;
+        }
+        if (!(*it)->cacheRefilledAfterMiss()) {
+            DPRINTF(
+                IQ,
+                "CacheMissed load inst [sn:%llu] PC %s has not been waken up "
+                "by Dcache\n",
+                (*it)->seqNum, (*it)->pcState());
+        }
+    }
+    return nullptr;
+}
+
 DynInstPtr
 InstructionQueue::getBlockedMemInstToExecute()
 {
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index 0d1b780d61..0d0f333e43 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -45,6 +45,7 @@
 #include <list>
 #include <map>
 #include <queue>
+#include <unordered_set>
 #include <vector>
 
 #include "base/statistics.hh"
@@ -199,6 +200,11 @@ class InstructionQueue
      */
     DynInstPtr getDeferredMemInstToExecute();
 
+    /** Gets a load instruction that was deferred due to a Dcache miss
+     *  if it is now ready to execute.  NULL if none available.
+     */
+    DynInstPtr getCacheMissInstToExecute();
+
     /** Gets a memory instruction that was blocked on the cache. NULL if none
      *  available.
      */
@@ -242,6 +248,11 @@ class InstructionQueue
      */
     void deferMemInst(const DynInstPtr &deferred_inst);
 
+    /**
+     * Defers a load instruction that missed in the Dcache, for later replay.
+     */
+    void cacheMissLdReplay(const DynInstPtr &deferred_inst);
+
     /**  Defers a memory instruction when it is cache blocked. */
     void blockMemInst(const DynInstPtr &blocked_inst);
 
@@ -302,6 +313,16 @@ class InstructionQueue
      */
     std::list<DynInstPtr> deferredMemInsts;
 
+    /** Set of load instructions waiting for a Dcache refill.
+     *  An unordered_set is used to prevent duplicate entries, since a
+     *  SplitDataRequest may call `cacheMissLdReplay` multiple times.
+     */
+    struct CacheMissLdInstsHash
+    {
+      size_t operator()(const DynInstPtr& ptr) const;
+    };
+    std::unordered_set<DynInstPtr, CacheMissLdInstsHash> cacheMissLdInsts;
+
     /** List of instructions that have been cache blocked. */
     std::list<DynInstPtr> blockedMemInsts;
 
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 70878bce7b..a3dd0af112 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -521,8 +521,23 @@ LSQ::recvFunctionalCustomSignal(PacketPtr pkt, int sig)
 
     LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->getPrimarySenderState());
     panic_if(!request, "Got packet back with unknown sender state\n");
-    if (sig == DcacheRespType::Miss) {
-        // notify cache miss
+    if (sig == DcacheRespType::Miss || sig == DcacheRespType::Block_Not_Ready) {
+        DPRINTF(LSQ, "recvFunctionalCustomSignal: Resp type: %d, [sn:%ld], lqidx: %ld\n",
+                sig, request->instruction()->seqNum, request->instruction()->lqIdx);
+        if (request->mainReq()->isLLSC() || request->mainReq()->isUncacheable()) {
+            // do not replay Amo/Uncache Load
+            DPRINTF(LSQ, "Recv Amo/Uncache Load: [sn:%ld], No Need to Replay\n",
+                    request->instruction()->seqNum);
+        } else {
+            // clear state in this instruction
+            request->instruction()->cacheRefilledAfterMiss(false);
+            request->instruction()->effAddrValid(false);
+            // clear request in loadQueue
+            thread[request->_port.lsqID].loadQueue[request->instruction()->lqIdx].setRequest(nullptr);
+            // insert to missed load replay queue
+            iewStage->cacheMissLdReplay(request->instruction());
+        }
+        // cancel subsequent dependent insts of this load
         iewStage->loadCancel(request->instruction());
     } else {
         panic("unsupported sig %d in recvFunctionalCustomSignal\n", sig);
@@ -1348,21 +1363,32 @@ LSQ::SbufferRequest::recvTimingResp(PacketPtr pkt)
 bool
 LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
 {
+    LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->senderState);
+    bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable();
     // Dump inst num, request addr, and packet addr
-    DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(),
-            pkt->getAddr());
+    DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, "
+                "isLLSC: %d, isUncache: %d, isCacheSatisfied: %d\n",
+                pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), request->mainReq()->isLLSC(),
+                request->mainReq()->isUncacheable(), pkt->cacheSatisfied);
     assert(_numOutstandingPackets == 1);
-    flags.set(Flag::Complete);
-    assert(pkt == _packets.front());
-    forward();
-    _port.completeDataAccess(pkt);
-    _hasStaleTranslation = false;
+    if (isNormalLd && !pkt->cacheSatisfied) {
+        // Data in Dcache is ready, wake up missed load in replay queue
+        LSQRequest::_inst->cacheRefilledAfterMiss(true);
+        discard();
+    } else {
+        flags.set(Flag::Complete);
+        assert(pkt == _packets.front());
+        forward();
+        _port.completeDataAccess(pkt);
+        _hasStaleTranslation = false;
+    }
     return true;
 }
 
 bool
 LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt)
 {
+    LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->senderState);
     DPRINTF(LSQ, "Spilt Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(),
             pkt->getAddr());
     uint32_t pktIdx = 0;
@@ -1371,21 +1397,28 @@ LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt)
     assert(pktIdx < _packets.size());
     numReceivedPackets++;
     if (numReceivedPackets == _packets.size()) {
-        flags.set(Flag::Complete);
-        /* Assemble packets. */
-        PacketPtr resp = isLoad()
-            ? Packet::createRead(_mainReq)
-            : Packet::createWrite(_mainReq);
-        if (isLoad())
-            resp->dataStatic(_inst->memData);
-        else
-            resp->dataStatic(_data);
-        resp->senderState = this;
-        forward();
-        _port.completeDataAccess(resp);
-        delete resp;
+        bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable();
+        if (isNormalLd && !pkt->cacheSatisfied) {
+            // Data in Dcache is ready, wake up missed load in replay queue
+            LSQRequest::_inst->cacheRefilledAfterMiss(true);
+            discard();
+        } else {
+            flags.set(Flag::Complete);
+            /* Assemble packets. */
+            PacketPtr resp = isLoad()
+                ? Packet::createRead(_mainReq)
+                : Packet::createWrite(_mainReq);
+            if (isLoad())
+                resp->dataStatic(_inst->memData);
+            else
+                resp->dataStatic(_data);
+            resp->senderState = this;
+            forward();
+            _port.completeDataAccess(resp);
+            delete resp;
+            _hasStaleTranslation = false;
+        }
     }
-    _hasStaleTranslation = false;
     return true;
 }
 
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index bc0ca3d9b0..5c1afe48fd 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -642,14 +642,16 @@ BaseCache::recvTimingReq(PacketPtr pkt)
         }
 
         handleTimingReqHit(pkt, blk, request_time, first_acc_after_pf);
-        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && lat > 1) {
-            // send cache miss signal
-            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && !pkt->isWrite() && lat > 1) {
+            // cache block not ready, send cancel signal
+            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Block_Not_Ready);
+            pkt->cacheSatisfied = false;
         }
     } else {
-        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead()) {
+        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead() && !pkt->isWrite()) {
             // send cache miss signal
             cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+            pkt->cacheSatisfied = false;
         }
 
         // ArchDB: for now we only track packet which has PC
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index a62d05de04..8964904215 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1598,6 +1598,8 @@ class Packet : public Printable
 
     bool tagReadFail = false;
 
+    bool cacheSatisfied = true;
+
     bool fromBOP() const { return pfSource == PrefetchSourceType::HWP_BOP; }
     
     PrefetchSourceType getPFSource() const { return static_cast<PrefetchSourceType>(pfSource); }
diff --git a/src/mem/request.hh b/src/mem/request.hh
index acbd793c0a..075949a2d9 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -91,6 +91,7 @@ enum DcacheRespType
 {
     NONE = 0,
     Hit,
+    Block_Not_Ready,
     Miss,
     NUM_Resp_Type
 };
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index e4ee650f85..50b97c742a 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -497,6 +497,7 @@ RubyPort::ruby_custom_signal_callback(PacketPtr pkt)
 
     DPRINTF(RubyPort, "Sent custom signal back to LSQ with sender state %#lx\n", sender_state);
     port->sendCustomSignal(pkt, DcacheRespType::Miss);
+    pkt->cacheSatisfied = false;
 }
 
 void
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 8f6213b70c..0b442ad1f5 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -383,7 +383,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
 
     if (seq_req_list.size() > 1) {
         if (cache_block_busy) {
-            if (pkt->isRead()) {
+            if (pkt->isRead() && !pkt->isWrite()) {
                 DPRINTF(RubySequencer, "Pkt %#lx %s is delayed because blk is busy doing ruby stuff\n",
                     pkt, pkt->cmdString());
                 ruby_custom_signal_callback(pkt);
@@ -649,7 +649,7 @@ Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }
@@ -693,7 +693,7 @@ Sequencer::TBEFullCancel(Addr address)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }