From 4073a100751edf364fe8b5a85a573ce034dd6657 Mon Sep 17 00:00:00 2001
From: lixin <1037997956@qq.com>
Date: Thu, 21 Nov 2024 18:37:35 +0800
Subject: [PATCH] cpu-o3: replay cache missed load from replayQ

This commit is only for normal load.
The uncache/amo load is the same as the original process.

Change-Id: Idc98ee18a6e94a39774ebba0f772820699b834de
---
 src/cpu/o3/dyn_inst.hh           | 13 ++++++
 src/cpu/o3/iew.cc                |  6 +++
 src/cpu/o3/iew.hh                |  3 ++
 src/cpu/o3/inst_queue.cc         | 49 +++++++++++++++++++-
 src/cpu/o3/inst_queue.hh         | 21 +++++++++
 src/cpu/o3/lsq.cc                | 79 ++++++++++++++++++++++----------
 src/mem/cache/base.cc            | 10 ++--
 src/mem/packet.hh                |  2 +
 src/mem/request.hh               |  1 +
 src/mem/ruby/system/RubyPort.cc  |  1 +
 src/mem/ruby/system/Sequencer.cc |  6 +--
 11 files changed, 160 insertions(+), 31 deletions(-)

diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 5bee40588d..d4d1d38162 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -194,6 +194,7 @@ class DynInst : public ExecContext, public RefCounted
         NotAnInst,
         TranslationStarted,
         TranslationCompleted,
+        CacheRefilledAfterMiss,
         PossibleLoadViolation,
         HitExternalSnoop,
         EffAddrValid,
@@ -462,6 +463,14 @@ class DynInst : public ExecContext, public RefCounted
     }
     void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; }
 
+    /** True if Dcache refilled after Dcache miss. */
+    bool
+    cacheRefilledAfterMiss() const
+    {
+        return instFlags[CacheRefilledAfterMiss];
+    }
+    void cacheRefilledAfterMiss(bool f) { instFlags[CacheRefilledAfterMiss] = f; }
+
     /** True if this address was found to match a previous load and they issued
      * out of order. If that happend, then it's only a problem if an incoming
      * snoop invalidate modifies the line, in which case we need to squash.
@@ -1397,6 +1406,10 @@ class DynInst : public ExecContext, public RefCounted
         return squashVer.getVersion();
     }
 
+    ssize_t getLqIdx()
+    {
+        return lqIdx;
+    }
 
     Addr getPC()
     {
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 5557640075..a622021638 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -682,6 +682,12 @@ IEW::blockMemInst(const DynInstPtr& inst)
     instQueue.blockMemInst(inst);
 }
 
+void
+IEW::cacheMissLdReplay(const DynInstPtr& inst)
+{
+    instQueue.cacheMissLdReplay(inst);
+}
+
 void
 IEW::cacheUnblocked()
 {
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index f41dfb9492..472924a02b 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -209,6 +209,9 @@ class IEW
     /** Moves memory instruction onto the list of cache blocked instructions */
     void blockMemInst(const DynInstPtr &inst);
 
+    /** Moves load instruction onto the Set of cache missed instructions */
+    void cacheMissLdReplay(const DynInstPtr &inst);
+
     /** Notifies that the cache has become unblocked */
     void cacheUnblocked();
 
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index 96f3d2eff2..9b080c8f67 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -94,6 +94,12 @@ InstructionQueue::FUCompletion::description() const
     return "Functional unit completion";
 }
 
+size_t
+InstructionQueue::CacheMissLdInstsHash::operator()(const DynInstPtr& ptr) const
+{
+    return ptr->getLqIdx();
+}
+
 InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
         const BaseO3CPUParams &params)
     : cpu(cpu_ptr),
@@ -352,6 +358,7 @@ InstructionQueue::resetState()
 
     nonSpecInsts.clear();
     deferredMemInsts.clear();
+    cacheMissLdInsts.clear();
     blockedMemInsts.clear();
     retryMemInsts.clear();
     wbOutstanding = 0;
@@ -650,6 +657,10 @@ InstructionQueue::scheduleReadyInsts()
     IssueStruct *i2e_info = issueToExecuteQueue->access(0);
 
     DynInstPtr mem_inst;
+    while ((mem_inst = getCacheMissInstToExecute())) {
+        mem_inst->issueQue->retryMem(mem_inst);
+    }
+
     while ((mem_inst = getDeferredMemInstToExecute())) {
         mem_inst->issueQue->retryMem(mem_inst);
     }
@@ -720,7 +731,7 @@ InstructionQueue::scheduleReadyInsts()
     // @todo If the way deferred memory instructions are handeled due to
     // translation changes then the deferredMemInsts condition should be
     // removed from the code below.
-    if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty()) {
+    if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty() || !cacheMissLdInsts.empty()) {
         cpu->activityThisCycle();
     } else {
         DPRINTF(IQ, "Not able to schedule any instructions.\n");
@@ -859,6 +870,19 @@ InstructionQueue::deferMemInst(const DynInstPtr &deferred_inst)
     deferredMemInsts.push_back(deferred_inst);
 }
 
+void
+InstructionQueue::cacheMissLdReplay(const DynInstPtr &deferred_inst)
+{
+    DPRINTF(IQ, "Get Cache Missed Load, insert to Replay Queue "
+            "[sn:%llu]\n", deferred_inst->seqNum);
+    // Reset DTB translation state
+    deferred_inst->translationStarted(false);
+    deferred_inst->translationCompleted(false);
+
+    deferred_inst->clearCanIssue();
+    cacheMissLdInsts.insert(deferred_inst);
+}
+
 void
 InstructionQueue::blockMemInst(const DynInstPtr &blocked_inst)
 {
@@ -901,6 +925,29 @@ InstructionQueue::getDeferredMemInstToExecute()
     return nullptr;
 }
 
+DynInstPtr
+InstructionQueue::getCacheMissInstToExecute()
+{
+    for (auto it = cacheMissLdInsts.begin(); it != cacheMissLdInsts.end();
+         ++it) {
+        if ((*it)->cacheRefilledAfterMiss() || (*it)->isSquashed()) {
+            DPRINTF(IQ, "CacheMissed load inst [sn:%llu] PC %s is ready to "
+                    "execute\n", (*it)->seqNum, (*it)->pcState());
+            DynInstPtr mem_inst = std::move(*it);
+            cacheMissLdInsts.erase(it);
+            return mem_inst;
+        }
+        if (!(*it)->cacheRefilledAfterMiss()) {
+            DPRINTF(
+                IQ,
+                "CacheMissed load inst [sn:%llu] PC %s has not been waken up "
+                "by Dcache\n",
+                (*it)->seqNum, (*it)->pcState());
+        }
+    }
+    return nullptr;
+}
+
 DynInstPtr
 InstructionQueue::getBlockedMemInstToExecute()
 {
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index 0d1b780d61..0d0f333e43 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -45,6 +45,7 @@
 #include <list>
 #include <map>
 #include <queue>
+#include <unordered_set>
 #include <vector>
 
 #include "base/statistics.hh"
@@ -199,6 +200,11 @@ class InstructionQueue
      */
     DynInstPtr getDeferredMemInstToExecute();
 
+    /** Gets a load instruction that was referred due to Dcache miss
+     * if it is now ready to execute. NULL if none available.
+     */
+    DynInstPtr getCacheMissInstToExecute();
+
     /** Gets a memory instruction that was blocked on the cache. NULL if none
      * available.
      */
@@ -242,6 +248,11 @@ class InstructionQueue
      */
     void deferMemInst(const DynInstPtr &deferred_inst);
 
+    /**
+     * Defers a load instruction when Dcache miss.
+     */
+    void cacheMissLdReplay(const DynInstPtr &deferred_inst);
+
     /** Defers a memory instruction when it is cache blocked. */
     void blockMemInst(const DynInstPtr &blocked_inst);
 
@@ -302,6 +313,16 @@ class InstructionQueue
      */
     std::list<DynInstPtr> deferredMemInsts;
 
+    /** Set of load instructions waiting for Dcache refill
+     * use unordered_set to prevent repeat enqueue,
+     * SplitDataRequest may call `cacheMissLdReplay` multiple times.
+     */
+    struct CacheMissLdInstsHash
+    {
+        size_t operator()(const DynInstPtr& ptr) const;
+    };
+    std::unordered_set<DynInstPtr, CacheMissLdInstsHash> cacheMissLdInsts;
+
     /** List of instructions that have been cache blocked. */
     std::list<DynInstPtr> blockedMemInsts;
 
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 70878bce7b..a3dd0af112 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -521,8 +521,23 @@ LSQ::recvFunctionalCustomSignal(PacketPtr pkt, int sig)
     LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->getPrimarySenderState());
     panic_if(!request, "Got packet back with unknown sender state\n");
 
-    if (sig == DcacheRespType::Miss) {
-        // notify cache miss
+    if (sig == DcacheRespType::Miss || sig == DcacheRespType::Block_Not_Ready) {
+        DPRINTF(LSQ, "recvFunctionalCustomSignal: Resp type: %d, [sn:%ld], lqidx: %ld\n",
+                sig, request->instruction()->seqNum, request->instruction()->lqIdx);
+        if (request->mainReq()->isLLSC() || request->mainReq()->isUncacheable()) {
+            // do not replay Amo/Uncache Load
+            DPRINTF(LSQ, "Recv Amo/Uncache Load: [sn:%ld], No Need to Replay\n",
+                    request->instruction()->seqNum);
+        } else {
+            // clear state in this instruction
+            request->instruction()->cacheRefilledAfterMiss(false);
+            request->instruction()->effAddrValid(false);
+            // clear request in loadQueue
+            thread[request->_port.lsqID].loadQueue[request->instruction()->lqIdx].setRequest(nullptr);
+            // insert to missed load replay queue
+            iewStage->cacheMissLdReplay(request->instruction());
+        }
+        // cancel subsequent dependent insts of this load
         iewStage->loadCancel(request->instruction());
     } else {
         panic("unsupported sig %d in recvFunctionalCustomSignal\n", sig);
@@ -1348,21 +1363,32 @@ LSQ::SbufferRequest::recvTimingResp(PacketPtr pkt)
 bool
 LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
 {
+    LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->senderState);
+    bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable();
     // Dump inst num, request addr, and packet addr
-    DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(),
-            pkt->getAddr());
+    DPRINTF(LSQ, "Single Req::recvTimingResp: inst: %llu, pkt: %#lx, isLoad: %d, "
+            "isLLSC: %d, isUncache: %d, isCacheSatisfied: %d\n",
+            pkt->req->getReqInstSeqNum(), pkt->getAddr(), isLoad(), request->mainReq()->isLLSC(),
+            request->mainReq()->isUncacheable(), pkt->cacheSatisfied);
     assert(_numOutstandingPackets == 1);
-    flags.set(Flag::Complete);
-    assert(pkt == _packets.front());
-    forward();
-    _port.completeDataAccess(pkt);
-    _hasStaleTranslation = false;
+    if (isNormalLd && !pkt->cacheSatisfied) {
+        // Data in Dcache is ready, wake up missed load in replay queue
+        LSQRequest::_inst->cacheRefilledAfterMiss(true);
+        discard();
+    } else {
+        flags.set(Flag::Complete);
+        assert(pkt == _packets.front());
+        forward();
+        _port.completeDataAccess(pkt);
+        _hasStaleTranslation = false;
+    }
     return true;
 }
 
 bool
 LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt)
 {
+    LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->senderState);
     DPRINTF(LSQ, "Spilt Req::recvTimingResp: inst: %llu, pkt: %#lx\n", pkt->req->getReqInstSeqNum(),
             pkt->getAddr());
     uint32_t pktIdx = 0;
@@ -1371,21 +1397,28 @@ LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt)
     assert(pktIdx < _packets.size());
     numReceivedPackets++;
     if (numReceivedPackets == _packets.size()) {
-        flags.set(Flag::Complete);
-        /* Assemble packets. */
-        PacketPtr resp = isLoad()
-            ? Packet::createRead(_mainReq)
-            : Packet::createWrite(_mainReq);
-        if (isLoad())
-            resp->dataStatic(_inst->memData);
-        else
-            resp->dataStatic(_data);
-        resp->senderState = this;
-        forward();
-        _port.completeDataAccess(resp);
-        delete resp;
+        bool isNormalLd = isLoad() && !request->mainReq()->isLLSC() && !request->mainReq()->isUncacheable();
+        if (isNormalLd && !pkt->cacheSatisfied) {
+            // Data in Dcache is ready, wake up missed load in replay queue
+            LSQRequest::_inst->cacheRefilledAfterMiss(true);
+            discard();
+        } else {
+            flags.set(Flag::Complete);
+            /* Assemble packets. */
+            PacketPtr resp = isLoad()
+                ? Packet::createRead(_mainReq)
+                : Packet::createWrite(_mainReq);
+            if (isLoad())
+                resp->dataStatic(_inst->memData);
+            else
+                resp->dataStatic(_data);
+            resp->senderState = this;
+            forward();
+            _port.completeDataAccess(resp);
+            delete resp;
+            _hasStaleTranslation = false;
+        }
     }
-    _hasStaleTranslation = false;
     return true;
 }
 
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index bc0ca3d9b0..5c1afe48fd 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -642,14 +642,16 @@ BaseCache::recvTimingReq(PacketPtr pkt)
         }
 
         handleTimingReqHit(pkt, blk, request_time, first_acc_after_pf);
-        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && lat > 1) {
-            // send cache miss signal
-            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+        if (cacheLevel == 1 && pkt->isResponse() && pkt->isRead() && !pkt->isWrite() && lat > 1) {
+            // cache block not ready, send cancel signal
+            cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Block_Not_Ready);
+            pkt->cacheSatisfied = false;
         }
     } else {
-        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead()) {
+        if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead() && !pkt->isWrite()) {
             // send cache miss signal
             cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
+            pkt->cacheSatisfied = false;
         }
 
         // ArchDB: for now we only track packet which has PC
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index a62d05de04..8964904215 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1598,6 +1598,8 @@ class Packet : public Printable
 
     bool tagReadFail = false;
 
+    bool cacheSatisfied = true;
+
     bool fromBOP() const { return pfSource == PrefetchSourceType::HWP_BOP; }
 
     PrefetchSourceType getPFSource() const { return static_cast<PrefetchSourceType>(pfSource); }
diff --git a/src/mem/request.hh b/src/mem/request.hh
index acbd793c0a..075949a2d9 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -91,6 +91,7 @@ enum DcacheRespType
 {
     NONE = 0,
     Hit,
+    Block_Not_Ready,
     Miss,
     NUM_Resp_Type
 };
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index e4ee650f85..50b97c742a 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -497,6 +497,7 @@ RubyPort::ruby_custom_signal_callback(PacketPtr pkt)
     DPRINTF(RubyPort, "Sent custom signal back to LSQ with sender state %#lx\n", sender_state);
 
     port->sendCustomSignal(pkt, DcacheRespType::Miss);
+    pkt->cacheSatisfied = false;
 }
 
 void
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 8f6213b70c..0b442ad1f5 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -383,7 +383,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
 
     if (seq_req_list.size() > 1) {
         if (cache_block_busy) {
-            if (pkt->isRead()) {
+            if (pkt->isRead() && !pkt->isWrite()) {
                 DPRINTF(RubySequencer, "Pkt %#lx %s is delayed because blk is busy doing ruby stuff\n",
                         pkt, pkt->cmdString());
                 ruby_custom_signal_callback(pkt);
@@ -649,7 +649,7 @@ Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }
@@ -693,7 +693,7 @@ Sequencer::TBEFullCancel(Addr address)
 
     // cancel pending loads' speculation
     for (auto &seq_req: seq_req_list) {
-        if (seq_req.pkt->isRead()) {
+        if (seq_req.pkt->isRead() && !seq_req.pkt->isWrite()) {
             ruby_custom_signal_callback(seq_req.pkt);
             stat.loadcancel++;
         }