From 4fd7c5300b5240eaaf1375954065c32c04ac0b31 Mon Sep 17 00:00:00 2001 From: Jacek Sieka Date: Sun, 2 Jan 2022 22:45:50 +0100 Subject: [PATCH 1/4] Backfiller Backfilling is the process of downloading historical blocks via P2P that are required to fulfill `GetBlocksByRange` duties - this happens during both trusted node and finalized checkpoint syncs. In particular, backfilling happens after syncing to head, such that attestation work can start as soon as possible. --- beacon_chain/beacon_node.nim | 1 + beacon_chain/conf.nim | 2 - .../consensus_object_pools/blockchain_dag.nim | 3 + beacon_chain/nimbus_beacon_node.nim | 39 ++- beacon_chain/sync/sync_manager.nim | 247 +++++++++--------- beacon_chain/sync/sync_queue.nim | 11 +- 6 files changed, 156 insertions(+), 147 deletions(-) diff --git a/beacon_chain/beacon_node.nim b/beacon_chain/beacon_node.nim index 6fabdd0d21..7e8eb83e5e 100644 --- a/beacon_chain/beacon_node.nim +++ b/beacon_chain/beacon_node.nim @@ -60,6 +60,7 @@ type vcProcess*: Process requestManager*: RequestManager syncManager*: SyncManager[Peer, PeerID] + backfiller*: SyncManager[Peer, PeerID] genesisSnapshotContent*: string actionTracker*: ActionTracker processor*: ref Eth2Processor diff --git a/beacon_chain/conf.nim b/beacon_chain/conf.nim index c3217998b7..9fb3df5ac2 100644 --- a/beacon_chain/conf.nim +++ b/beacon_chain/conf.nim @@ -233,12 +233,10 @@ type name: "weak-subjectivity-checkpoint" }: Option[Checkpoint] finalizedCheckpointState* {. - hidden # TODO unhide when backfilling is done desc: "SSZ file specifying a recent finalized state" name: "finalized-checkpoint-state" }: Option[InputFile] finalizedCheckpointBlock* {. - hidden # TODO unhide when backfilling is done desc: "SSZ file specifying a recent finalized block" name: "finalized-checkpoint-block" }: Option[InputFile] diff --git a/beacon_chain/consensus_object_pools/blockchain_dag.nim b/beacon_chain/consensus_object_pools/blockchain_dag.nim index 4c9a76a06a..a8246bf74d 100644 --- a/beacon_chain/consensus_object_pools/blockchain_dag.nim +++ b/beacon_chain/consensus_object_pools/blockchain_dag.nim @@ -1664,3 +1664,6 @@ proc getBlockSSZ*(dag: ChainDAGRef, id: BlockId, bytes: var seq[byte]): bool = dag.db.getAltairBlockSSZ(id.root, bytes) of BeaconBlockFork.Bellatrix: dag.db.getMergeBlockSSZ(id.root, bytes) + +func needsBackfill*(dag: ChainDAGRef): bool = + dag.backfill.slot > dag.genesis.slot diff --git a/beacon_chain/nimbus_beacon_node.nim b/beacon_chain/nimbus_beacon_node.nim index c2ea8eacbb..f5c3ab53f7 100644 --- a/beacon_chain/nimbus_beacon_node.nim +++ b/beacon_chain/nimbus_beacon_node.nim @@ -468,8 +468,11 @@ proc init*(T: type BeaconNode, taskpool) syncManager = newSyncManager[Peer, PeerID]( network.peerPool, SyncQueueKind.Forward, getLocalHeadSlot, getLocalWallSlot, - getFirstSlotAtFinalizedEpoch, getBackfillSlot, dag.tail.slot, - blockVerifier) + getFirstSlotAtFinalizedEpoch, getBackfillSlot, blockVerifier) + backfiller = newSyncManager[Peer, PeerID]( + network.peerPool, SyncQueueKind.Backward, getLocalHeadSlot, getLocalWallSlot, + getFirstSlotAtFinalizedEpoch, getBackfillSlot, blockVerifier, + maxHeadAge = 0) let stateTtlCache = if config.restCacheSize > 0: StateTtlCache.init( @@ -500,6 +503,7 @@ proc init*(T: type BeaconNode, eventBus: eventBus, requestManager: RequestManager.init(network, blockVerifier), syncManager: syncManager, + backfiller: backfiller, actionTracker: ActionTracker.init(rng, config.subscribeAllSubnets), processor: processor, blockProcessor: blockProcessor, @@ -917,6 +921,11 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} = # above, this will be done just before the next slot starts await node.updateGossipStatus(slot + 1) +proc syncStatus(node: BeaconNode): string = + if node.syncManager.inProgress: node.syncManager.syncStatus + elif node.backfiller.inProgress: "backfill: " & node.backfiller.syncStatus + else: "synced" + proc onSlotStart( node: BeaconNode, wallTime: BeaconTime, lastSlot: Slot) {.async.} = ## Called at the beginning of a slot - usually every slot, but sometimes might @@ -936,9 +945,7 @@ proc onSlotStart( info "Slot start", slot = shortLog(wallSlot), epoch = shortLog(wallSlot.epoch), - sync = - if node.syncManager.inProgress: node.syncManager.syncStatus - else: "synced", + sync = node.syncStatus(), peers = len(node.network.peerPool), head = shortLog(node.dag.head), finalized = shortLog(getStateField( @@ -1127,6 +1134,18 @@ proc stop(node: BeaconNode) = node.db.close() notice "Databases closed" +proc startBackfillTask(node: BeaconNode) {.async.} = + while node.dag.needsBackfill: + if not node.syncManager.inProgress: + # Only start the backfiller if it's needed _and_ head sync has completed - + # if we lose sync after having synced head, we could stop the backfilller, + # but this should be a fringe case - might as well keep the logic simple for + # now + node.backfiller.start() + return + + await sleepAsync(chronos.seconds(2)) + proc run(node: BeaconNode) {.raises: [Defect, CatchableError].} = bnStatus = BeaconNodeStatus.Running @@ -1150,6 +1169,8 @@ proc run(node: BeaconNode) {.raises: [Defect, CatchableError].} = node.requestManager.start() node.syncManager.start() + if node.dag.needsBackfill(): asyncSpawn node.startBackfillTask() + waitFor node.updateGossipStatus(wallSlot) asyncSpawn runSlotLoop(node, wallTime, onSlotStart) @@ -1327,13 +1348,7 @@ proc initStatusBar(node: BeaconNode) {.raises: [Defect, ValueError].} = formatGwei(node.attachedValidatorBalanceTotal) of "sync_status": - if isNil(node.syncManager): - "pending" - else: - if node.syncManager.inProgress: - node.syncManager.syncStatus - else: - "synced" + node.syncStatus() else: # We ignore typos for now and just render the expression # as it was written. TODO: come up with a good way to show diff --git a/beacon_chain/sync/sync_manager.nim b/beacon_chain/sync/sync_manager.nim index 070e7ad9b2..68ff360da6 100644 --- a/beacon_chain/sync/sync_manager.nim +++ b/beacon_chain/sync/sync_manager.nim @@ -171,146 +171,73 @@ proc getBlocks*[A, B](man: SyncManager[A, B], peer: A, topics = "syncman", error = $res.error() result = res.map() do (blcks: seq[phase0.SignedBeaconBlock]) -> auto: blcks.mapIt(ForkedSignedBeaconBlock.init(it)) -template headAge(): uint64 = - wallSlot - headSlot - -template peerStatusAge(): Duration = - Moment.now() - peer.state(BeaconSync).statusLastTime +proc remainingSlots(man: SyncManager): uint64 = + if man.direction == SyncQueueKind.Forward: + man.getLastSlot() - man.getFirstSlot() + else: + man.getFirstSlot() - man.getLastSlot() proc syncStep[A, B](man: SyncManager[A, B], index: int, peer: A) {.async.} = - let wallSlot = man.getLocalWallSlot() - let headSlot = man.getLocalHeadSlot() - var peerSlot = peer.getHeadSlot() - - debug "Peer's syncing status", wall_clock_slot = wallSlot, - remote_head_slot = peerSlot, local_head_slot = headSlot, - peer_score = peer.getScore(), peer = peer, index = index, - peer_speed = peer.netKbps(), direction = man.direction, - topics = "syncman" - - # Check if peer's head slot is bigger than our wall clock slot. - if peerSlot > wallSlot + man.toleranceValue: - peer.updateScore(PeerScoreHeadTooNew) - - warn "Peer reports a head newer than our wall clock - clock out of sync?", - wall_clock_slot = wallSlot, remote_head_slot = peerSlot, - local_head_slot = headSlot, peer = peer, index = index, - tolerance_value = man.toleranceValue, peer_speed = peer.netKbps(), - peer_score = peer.getScore(), direction = man.direction, - topics = "syncman" - return + var + headSlot = man.getLocalHeadSlot() + wallSlot = man.getLocalWallSlot() + peerSlot = peer.getHeadSlot() - # Check if we need to update peer's status information - if peerStatusAge >= StatusExpirationTime: - # Peer's status information is very old, its time to update it - man.workers[index].status = SyncWorkerStatus.UpdatingStatus - trace "Updating peer's status information", wall_clock_slot = wallSlot, + block: # Check that peer status is recent and relevant + debug "Peer's syncing status", wall_clock_slot = wallSlot, remote_head_slot = peerSlot, local_head_slot = headSlot, - peer = peer, peer_score = peer.getScore(), index = index, + peer_score = peer.getScore(), peer = peer, index = index, peer_speed = peer.netKbps(), direction = man.direction, topics = "syncman" - try: - let res = await peer.updateStatus() - if not(res): - peer.updateScore(PeerScoreNoStatus) - debug "Failed to get remote peer's status, exiting", peer = peer, - peer_score = peer.getScore(), peer_head_slot = peerSlot, - peer_speed = peer.netKbps(), index = index, - direction = man.direction, topics = "syncman" - return - except CatchableError as exc: - debug "Unexpected exception while updating peer's status", - peer = peer, peer_score = peer.getScore(), - peer_head_slot = peerSlot, peer_speed = peer.netKbps(), - index = index, errMsg = exc.msg, direction = man.direction, - topics = "syncman" - return - - let newPeerSlot = peer.getHeadSlot() - if peerSlot >= newPeerSlot: - peer.updateScore(PeerScoreStaleStatus) - debug "Peer's status information is stale", - wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot, - local_head_slot = headSlot, remote_new_head_slot = newPeerSlot, + let + peerStatusAge = Moment.now() - peer.state(BeaconSync).statusLastTime + needsUpdate = + # Latest status we got is old + peerStatusAge >= StatusExpirationTime or + # The point we need to sync is close to where the peer is + man.getFirstSlot() >= peerSlot + + if needsUpdate: + man.workers[index].status = SyncWorkerStatus.UpdatingStatus + + # Avoid a stampede of requests, but make them more frequent in case the + # peer is "close" to the slot range of interest + if peerStatusAge < StatusExpirationTime div 2: + await sleepAsync(StatusExpirationTime div 2 - peerStatusAge) + + trace "Updating peer's status information", wall_clock_slot = wallSlot, + remote_head_slot = peerSlot, local_head_slot = headSlot, peer = peer, peer_score = peer.getScore(), index = index, peer_speed = peer.netKbps(), direction = man.direction, topics = "syncman" - else: - debug "Peer's status information updated", wall_clock_slot = wallSlot, - remote_old_head_slot = peerSlot, local_head_slot = headSlot, - remote_new_head_slot = newPeerSlot, peer = peer, - peer_score = peer.getScore(), peer_speed = peer.netKbps(), - index = index, direction = man.direction, topics = "syncman" - peer.updateScore(PeerScoreGoodStatus) - peerSlot = newPeerSlot - - if headAge <= man.maxHeadAge: - info "We are in sync with network", wall_clock_slot = wallSlot, - remote_head_slot = peerSlot, local_head_slot = headSlot, - peer = peer, peer_score = peer.getScore(), index = index, - peer_speed = peer.netKbps(), topics = "syncman" - # We clear SyncManager's `notInSyncEvent` so all the workers will become - # sleeping soon. - man.notInSyncEvent.clear() - return - - if headSlot >= peerSlot - man.maxHeadAge: - debug "We are in sync with peer; refreshing peer's status information", - wall_clock_slot = wallSlot, remote_head_slot = peerSlot, - local_head_slot = headSlot, peer = peer, peer_score = peer.getScore(), - index = index, peer_speed = peer.netKbps(), direction = man.direction, - topics = "syncman" - - man.workers[index].status = SyncWorkerStatus.UpdatingStatus - if peerStatusAge <= StatusUpdateInterval: - await sleepAsync(StatusUpdateInterval - peerStatusAge) - - try: - let res = await peer.updateStatus() - if not(res): - peer.updateScore(PeerScoreNoStatus) - debug "Failed to get remote peer's status, exiting", peer = peer, - peer_score = peer.getScore(), peer_head_slot = peerSlot, - peer_speed = peer.netKbps(), index = index, - direction = man.direction, topics = "syncman" + try: + let res = await peer.updateStatus() + if not(res): + peer.updateScore(PeerScoreNoStatus) + debug "Failed to get remote peer's status, exiting", peer = peer, + peer_score = peer.getScore(), peer_head_slot = peerSlot, + peer_speed = peer.netKbps(), index = index, + direction = man.direction, topics = "syncman" + return + except CatchableError as exc: + debug "Unexpected exception while updating peer's status", + peer = peer, peer_score = peer.getScore(), + peer_head_slot = peerSlot, peer_speed = peer.netKbps(), + index = index, errMsg = exc.msg, direction = man.direction, + topics = "syncman" return - except CatchableError as exc: - debug "Unexpected exception while updating peer's status", - peer = peer, peer_score = peer.getScore(), - peer_head_slot = peerSlot, peer_speed = peer.netKbps(), - index = index, errMsg = exc.msg, direction = man.direction, - topics = "syncman" - return - let newPeerSlot = peer.getHeadSlot() - if peerSlot >= newPeerSlot: - peer.updateScore(PeerScoreStaleStatus) - debug "Peer's status information is stale", - wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot, - local_head_slot = headSlot, remote_new_head_slot = newPeerSlot, - peer = peer, peer_score = peer.getScore(), index = index, - peer_speed = peer.netKbps(), direction = man.direction, - topics = "syncman" - else: - # This is not very good solution because we should not discriminate and/or - # penalize peers which are in sync process too, but their latest head is - # lower then our latest head. We should keep connections with such peers - # (so this peers are able to get in sync using our data), but we should - # not use this peers for syncing because this peers are useless for us. - # Right now we decreasing peer's score a bit, so it will not be - # disconnected due to low peer's score, but new fresh peers could replace - # peers with low latest head. - if headSlot >= newPeerSlot - man.maxHeadAge: - # Peer's head slot is still lower then ours. - debug "Peer's head slot is lower then local head slot", + let newPeerSlot = peer.getHeadSlot() + if peerSlot >= newPeerSlot: + peer.updateScore(PeerScoreStaleStatus) + debug "Peer's status information is stale", wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot, local_head_slot = headSlot, remote_new_head_slot = newPeerSlot, - peer = peer, peer_score = peer.getScore(), - peer_speed = peer.netKbps(), index = index, - direction = man.direction, topics = "syncman" - peer.updateScore(PeerScoreUseless) + peer = peer, peer_score = peer.getScore(), index = index, + peer_speed = peer.netKbps(), direction = man.direction, + topics = "syncman" else: debug "Peer's status information updated", wall_clock_slot = wallSlot, remote_old_head_slot = peerSlot, local_head_slot = headSlot, @@ -320,10 +247,74 @@ proc syncStep[A, B](man: SyncManager[A, B], index: int, peer: A) {.async.} = peer.updateScore(PeerScoreGoodStatus) peerSlot = newPeerSlot + # Time passed - enough to move slots, if sleep happened + headSlot = man.getLocalHeadSlot() + wallSlot = man.getLocalWallSlot() + + if peerSlot > wallSlot + man.toleranceValue: + # If the peer reports a head slot higher than our wall slot, something is + # wrong: our clock is off or the peer is on a different network (or + # dishonest) + peer.updateScore(PeerScoreHeadTooNew) + + warn "Peer reports a head newer than our wall clock - clock out of sync?", + wall_clock_slot = wallSlot, remote_head_slot = peerSlot, + local_head_slot = headSlot, peer = peer, index = index, + tolerance_value = man.toleranceValue, peer_speed = peer.netKbps(), + peer_score = peer.getScore(), direction = man.direction, + topics = "syncman" + return + + if man.remainingSlots() <= man.maxHeadAge: + case man.direction + of SyncQueueKind.Forward: + info "We are in sync with network", wall_clock_slot = wallSlot, + remote_head_slot = peerSlot, local_head_slot = headSlot, + peer = peer, peer_score = peer.getScore(), index = index, + peer_speed = peer.netKbps(), direction = man.direction, + topics = "syncman" + of SyncQueueKind.Backward: + info "Backfill complete", wall_clock_slot = wallSlot, + remote_head_slot = peerSlot, local_head_slot = headSlot, + peer = peer, peer_score = peer.getScore(), index = index, + peer_speed = peer.netKbps(), direction = man.direction, + topics = "syncman" + + # We clear SyncManager's `notInSyncEvent` so all the workers will become + # sleeping soon. + man.notInSyncEvent.clear() + return + + # Find out if the peer potentially can give useful blocks - in the case of + # forward sync, they can be useful if they have blocks newer than our head - + # in the case of backwards sync, they're useful if they have blocks newer than + # the backfill point + if man.getFirstSlot() >= peerSlot: + # This is not very good solution because we should not discriminate and/or + # penalize peers which are in sync process too, but their latest head is + # lower then our latest head. We should keep connections with such peers + # (so this peers are able to get in sync using our data), but we should + # not use this peers for syncing because this peers are useless for us. + # Right now we decreasing peer's score a bit, so it will not be + # disconnected due to low peer's score, but new fresh peers could replace + # peers with low latest head. + debug "Peer's head slot is lower then local head slot", + wall_clock_slot = wallSlot, remote_head_slot = peerSlot, + local_last_slot = man.getLastSlot(), + local_first_slot = man.getFirstSlot(), peer = peer, + peer_score = peer.getScore(), + peer_speed = peer.netKbps(), index = index, + direction = man.direction, topics = "syncman" + peer.updateScore(PeerScoreUseless) + + # Give the peer time to do some syncing + await sleepAsync(StatusExpirationTime div 2) + return - # We updating SyncQueue's last slot all the time - man.queue.updateLastSlot(man.getLastSlot()) + if man.direction == SyncQueueKind.Forward: + # Wall clock keeps ticking, so we need to update the queue + man.queue.updateLastSlot(man.getLastSlot()) man.workers[index].status = SyncWorkerStatus.Requesting let req = man.queue.pop(peerSlot, peer) @@ -573,7 +564,7 @@ proc syncLoop[A, B](man: SyncManager[A, B]) {.async.} = man.avgSyncSpeed.formatBiggestFloat(ffDecimal, 4) & "slots/s (" & map & ":" & $man.queue.outSlot & ")" - if headAge <= man.maxHeadAge: + if man.remainingSlots() <= man.maxHeadAge: man.notInSyncEvent.clear() # We are marking SyncManager as not working only when we are in sync and # all sync workers are in `Sleeping` state. diff --git a/beacon_chain/sync/sync_queue.nim b/beacon_chain/sync/sync_queue.nim index 318ddf0e3e..c28f0466b7 100644 --- a/beacon_chain/sync/sync_queue.nim +++ b/beacon_chain/sync/sync_queue.nim @@ -491,7 +491,7 @@ proc notInRange[T](sq: SyncQueue[T], sr: SyncRequest[T]): bool = of SyncQueueKind.Forward: (sq.queueSize > 0) and (sr.slot != sq.outSlot) of SyncQueueKind.Backward: - (sq.queueSize > 0) and (sr.slot + sr.count - 1'u64 != sq.outSlot) + (sq.queueSize > 0) and (sr.slot + sr.count != sq.outSlot) proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], data: seq[ForkedSignedBeaconBlock], @@ -532,7 +532,7 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], some(sq.readyQueue.pop()) of SyncQueueKind.Backward: let maxSlot = sq.readyQueue[0].request.slot + - (sq.readyQueue[0].request.count - 1'u64) + sq.readyQueue[0].request.count if sq.outSlot != maxSlot: none[SyncResult[T]]() else: @@ -630,7 +630,8 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], of SyncQueueKind.Backward: if safeSlot > req.slot: let rewindSlot = sq.getRewindPoint(failSlot.get(), safeSlot) - warn "Unexpected missing parent, rewind happens", + # It's quite common peers give us fewer blocks than we ask for + info "Gap in block range response, rewinding", peer = req.item, rewind_to_slot = rewindSlot, rewind_fail_slot = failSlot.get(), finalized_slot = safeSlot, @@ -716,11 +717,11 @@ proc pop*[T](sq: SyncQueue[T], maxslot: Slot, item: T): SyncRequest[T] = of SyncQueueKind.Backward: if sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64: return SyncRequest.empty(sq.kind, T) - if sq.inpSlot < sq.finalSlot: + if sq.inpSlot <= sq.finalSlot: return SyncRequest.empty(sq.kind, T) let (slot, count) = block: - let baseSlot = sq.inpSlot + 1'u64 + let baseSlot = sq.inpSlot if baseSlot - sq.finalSlot < sq.chunkSize: let count = uint64(baseSlot - sq.finalSlot) (baseSlot - count, count) From df72fecdcd7b92b29c192a0ab92b8a439b21199f Mon Sep 17 00:00:00 2001 From: cheatfate Date: Mon, 17 Jan 2022 07:07:33 +0200 Subject: [PATCH 2/4] Fix SyncQueue initialization procedure. Remove usage of `awaitne`. Add cancellation support. Remove unneeded `sleepAsync()` if peer's head is older than needed. Add `direction` field to all logs. Fix syncmanager wedge issue. Add proper resource cleaning procedure on backward sync finish. --- beacon_chain/sync/sync_manager.nim | 219 +++++++++++++++++++++-------- beacon_chain/sync/sync_queue.nim | 52 ++++--- 2 files changed, 194 insertions(+), 77 deletions(-) diff --git a/beacon_chain/sync/sync_manager.nim b/beacon_chain/sync/sync_manager.nim index 68ff360da6..b7c6527802 100644 --- a/beacon_chain/sync/sync_manager.nim +++ b/beacon_chain/sync/sync_manager.nim @@ -89,9 +89,24 @@ proc speed*(start, finish: SyncMoment): float {.inline.} = slots / dur proc initQueue[A, B](man: SyncManager[A, B]) = - man.queue = SyncQueue.init(A, man.direction, man.getFirstSlot(), - man.getLastSlot(), man.chunkSize, - man.getSafeSlot, man.blockVerifier, 1) + case man.direction + of SyncQueueKind.Forward: + man.queue = SyncQueue.init(A, man.direction, man.getFirstSlot(), + man.getLastSlot(), man.chunkSize, + man.getSafeSlot, man.blockVerifier, 1) + of SyncQueueKind.Backward: + let + firstSlot = man.getFirstSlot() + lastSlot = man.getLastSlot() + startSlot = if firstSlot == lastSlot: + # This case should never be happened in real life because + # there is present check `needsBackfill(). + firstSlot + else: + Slot(firstSlot - 1'u64) + man.queue = SyncQueue.init(A, man.direction, firstSlot, lastSlot, + man.chunkSize, man.getSafeSlot, + man.blockVerifier, 1) proc newSyncManager*[A, B](pool: PeerPool[A, B], direction: SyncQueueKind, @@ -112,9 +127,10 @@ proc newSyncManager*[A, B](pool: PeerPool[A, B], of SyncQueueKind.Forward: (getLocalHeadSlotCb, getLocalWallSlotCb, getFinalizedSlotCb) of SyncQueueKind.Backward: - (getBackfillSlotCb, GetSlotCallback(proc(): Slot = Slot(0)), getBackfillSlotCb) + (getBackfillSlotCb, GetSlotCallback(proc(): Slot = Slot(0)), + getBackfillSlotCb) - result = SyncManager[A, B]( + var res = SyncManager[A, B]( pool: pool, maxStatusAge: maxStatusAge, getLocalHeadSlot: getLocalHeadSlotCb, @@ -130,7 +146,8 @@ proc newSyncManager*[A, B](pool: PeerPool[A, B], notInSyncEvent: newAsyncEvent(), direction: direction ) - result.initQueue() + res.initQueue() + res proc getBlocks*[A, B](man: SyncManager[A, B], peer: A, req: SyncRequest): Future[BeaconBlocksRes] {.async.} = @@ -141,35 +158,56 @@ proc getBlocks*[A, B](man: SyncManager[A, B], peer: A, peer_score = peer.getScore(), peer_speed = peer.netKbps(), direction = man.direction, topics = "syncman" if peer.useSyncV2(): - var workFut = awaitne beaconBlocksByRange_v2(peer, req.slot, req.count, req.step) - if workFut.failed(): - debug "Error, while waiting getBlocks response", peer = peer, - slot = req.slot, slot_count = req.count, step = req.step, - errMsg = workFut.readError().msg, peer_speed = peer.netKbps(), - direction = man.direction, topics = "syncman" - else: - let res = workFut.read() - if res.isErr: - debug "Error, while reading getBlocks response", + let res = + try: + await beaconBlocksByRange_v2(peer, req.slot, req.count, req.step) + except CancelledError: + debug "Interrupt, while waiting getBlocks response", peer = peer, + slot = req.slot, slot_count = req.count, step = req.step, + peer_speed = peer.netKbps(), direction = man.direction, + topics = "syncman" + return + except CatchableError as exc: + debug "Error, while waiting getBlocks response", peer = peer, + slot = req.slot, slot_count = req.count, step = req.step, + errName = exc.name, errMsg = exc.msg, peer_speed = peer.netKbps(), + direction = man.direction, topics = "syncman" + return + if res.isErr(): + debug "Error, while reading getBlocks response", peer = peer, slot = req.slot, count = req.count, step = req.step, peer_speed = peer.netKbps(), - direction = man.direction, topics = "syncman", error = $res.error() - result = res + direction = man.direction, topics = "syncman", + error = $res.error() + return + return res else: - var workFut = awaitne beaconBlocksByRange(peer, req.slot, req.count, req.step) - if workFut.failed(): - debug "Error, while waiting getBlocks response", peer = peer, - slot = req.slot, slot_count = req.count, step = req.step, - errMsg = workFut.readError().msg, peer_speed = peer.netKbps(), - direction = man.direction, topics = "syncman" - else: - let res = workFut.read() - if res.isErr: - debug "Error, while reading getBlocks response", - peer = peer, slot = req.slot, count = req.count, - step = req.step, peer_speed = peer.netKbps(), - topics = "syncman", error = $res.error() - result = res.map() do (blcks: seq[phase0.SignedBeaconBlock]) -> auto: blcks.mapIt(ForkedSignedBeaconBlock.init(it)) + let res = + try: + await beaconBlocksByRange(peer, req.slot, req.count, req.step) + except CancelledError: + debug "Interrupt, while waiting getBlocks response", peer = peer, + slot = req.slot, slot_count = req.count, step = req.step, + peer_speed = peer.netKbps(), direction = man.direction, + topics = "syncman" + return + except CatchableError as exc: + debug "Error, while waiting getBlocks response", peer = peer, + slot = req.slot, slot_count = req.count, step = req.step, + errName = exc.name, errMsg = exc.msg, peer_speed = peer.netKbps(), + direction = man.direction, topics = "syncman" + return + if res.isErr(): + debug "Error, while reading getBlocks response", + peer = peer, slot = req.slot, count = req.count, + step = req.step, peer_speed = peer.netKbps(), + direction = man.direction, error = $res.error(), + topics = "syncman" + return + let forked = + res.map() do (blcks: seq[phase0.SignedBeaconBlock]) -> auto: + blcks.mapIt(ForkedSignedBeaconBlock.init(it)) + return forked proc remainingSlots(man: SyncManager): uint64 = if man.direction == SyncQueueKind.Forward: @@ -306,10 +344,6 @@ proc syncStep[A, B](man: SyncManager[A, B], index: int, peer: A) {.async.} = peer_speed = peer.netKbps(), index = index, direction = man.direction, topics = "syncman" peer.updateScore(PeerScoreUseless) - - # Give the peer time to do some syncing - await sleepAsync(StatusExpirationTime div 2) - return if man.direction == SyncQueueKind.Forward: @@ -348,7 +382,7 @@ proc syncStep[A, B](man: SyncManager[A, B], index: int, peer: A) {.async.} = try: let blocks = await man.getBlocks(peer, req) - if blocks.isOk: + if blocks.isOk(): let data = blocks.get() let smap = getShortMap(req, data) debug "Received blocks on request", blocks_count = len(data), @@ -384,26 +418,47 @@ proc syncStep[A, B](man: SyncManager[A, B], index: int, peer: A) {.async.} = except CatchableError as exc: debug "Unexpected exception while receiving blocks", - request_slot = req.slot, request_count = req.count, - request_step = req.step, peer = peer, index = index, - peer_score = peer.getScore(), peer_speed = peer.netKbps(), - errMsg = exc.msg, direction = man.direction, topics = "syncman" + request_slot = req.slot, request_count = req.count, + request_step = req.step, peer = peer, index = index, + peer_score = peer.getScore(), peer_speed = peer.netKbps(), + errName = exc.name, errMsg = exc.msg, direction = man.direction, + topics = "syncman" return proc syncWorker[A, B](man: SyncManager[A, B], index: int) {.async.} = mixin getKey, getScore, getHeadSlot - debug "Starting syncing worker", - index = index, direction = man.direction, topics = "syncman" + debug "Starting syncing worker", index = index, direction = man.direction, + topics = "syncman" while true: - man.workers[index].status = SyncWorkerStatus.Sleeping - # This event is going to be set until we are not in sync with network - await man.notInSyncEvent.wait() - man.workers[index].status = SyncWorkerStatus.WaitingPeer - let peer = await man.pool.acquire() - await man.syncStep(index, peer) - man.pool.release(peer) + var peer: A = nil + let doBreak = + try: + man.workers[index].status = SyncWorkerStatus.Sleeping + # This event is going to be set until we are not in sync with network + await man.notInSyncEvent.wait() + man.workers[index].status = SyncWorkerStatus.WaitingPeer + peer = await man.pool.acquire() + await man.syncStep(index, peer) + man.pool.release(peer) + false + except CancelledError: + if not(isNil(peer)): + man.pool.release(peer) + true + except CatchableError as exc: + debug "Unexpected exception in sync worker", + peer = peer, index = index, + peer_score = peer.getScore(), peer_speed = peer.netKbps(), + errName = exc.name, errMsg = exc.msg, direction = man.direction, + topics = "syncman" + true + if doBreak: + break + + debug "Sync worker stopped", index = index, direction = man.direction, + topics = "syncman" proc getWorkersStats[A, B](man: SyncManager[A, B]): tuple[map: string, sleeping: int, @@ -453,7 +508,8 @@ proc guardTask[A, B](man: SyncManager[A, B]) {.async.} = let index = pending.find(failFuture) if failFuture.failed(): warn "Synchronization worker stopped working unexpectedly with an error", - index = index, errMsg = failFuture.error.msg, direction = man.direction + index = index, errMsg = failFuture.error.msg, + direction = man.direction else: warn "Synchronization worker stopped working unexpectedly without error", index = index, direction = man.direction @@ -492,9 +548,10 @@ proc syncLoop[A, B](man: SyncManager[A, B]) {.async.} = mixin getKey, getScore var pauseTime = 0 - asyncSpawn man.guardTask() + var guardTaskFut = man.guardTask() - debug "Synchronization loop started", topics = "syncman" + debug "Synchronization loop started", topics = "syncman", + direction = man.direction proc averageSpeedTask() {.async.} = while true: @@ -526,7 +583,7 @@ proc syncLoop[A, B](man: SyncManager[A, B]) {.async.} = stamp = newStamp - asyncSpawn averageSpeedTask() + var averageSpeedTaskFut = averageSpeedTask() while true: let wallSlot = man.getLocalWallSlot() @@ -575,13 +632,51 @@ proc syncLoop[A, B](man: SyncManager[A, B]) {.async.} = sleeping_workers_count = sleeping, waiting_workers_count = waiting, pending_workers_count = pending, direction = man.direction, topics = "syncman" + # We already synced, so we should reset all the pending workers from + # any state they have. + man.queue.clearAndWakeup() man.inProgress = true else: - debug "Synchronization loop sleeping", wall_head_slot = wallSlot, - local_head_slot = headSlot, difference = (wallSlot - headSlot), - max_head_age = man.maxHeadAge, direction = man.direction, - topics = "syncman" - man.inProgress = false + case man.direction + of SyncQueueKind.Forward: + if man.inProgress: + man.inProgress = false + debug "Forward synchronization process finished, sleeping", + wall_head_slot = wallSlot, local_head_slot = headSlot, + difference = (wallSlot - headSlot), + max_head_age = man.maxHeadAge, direction = man.direction, + topics = "syncman" + else: + debug "Synchronization loop sleeping", wall_head_slot = wallSlot, + local_head_slot = headSlot, + difference = (wallSlot - headSlot), + max_head_age = man.maxHeadAge, direction = man.direction, + topics = "syncman" + of SyncQueueKind.Backward: + # Backward syncing is going to be executed only once, so we exit loop + # and stop all pending tasks which belongs to this instance (sync + # workers, guard task and speed calculation task). + # We first need to cancel and wait for guard task, because otherwise + # it will be able to restore cancelled workers. + guardTaskFut.cancel() + averageSpeedTaskFut.cancel() + await allFutures(guardTaskFut, averageSpeedTaskFut) + let pendingTasks = + block: + var res: seq[Future[void]] + for worker in man.workers: + # Because `pending == 0` there should be no active workers. + doAssert(worker.status in {Sleeping, WaitingPeer}) + worker.future.cancel() + res.add(worker.future) + res + await allFutures(pendingTasks) + debug "Backward synchronization process finished, exiting", + wall_head_slot = wallSlot, local_head_slot = headSlot, + backfill_slot = man.getLastSlot(), + max_head_age = man.maxHeadAge, direction = man.direction, + topics = "syncman" + break else: if not(man.notInSyncEvent.isSet()): # We get here only if we lost sync for more then `maxHeadAge` period. @@ -589,6 +684,12 @@ proc syncLoop[A, B](man: SyncManager[A, B]) {.async.} = man.initQueue() man.notInSyncEvent.fire() man.inProgress = true + debug "Node lost sync for more then preset period", + period = man.maxHeadAge, wall_head_slot = wallSlot, + local_head_slot = headSlot, + missing_slots = man.remainingSlots(), + progress = float(man.queue.progress()), + topics = "syncman" else: man.notInSyncEvent.fire() man.inProgress = true diff --git a/beacon_chain/sync/sync_queue.nim b/beacon_chain/sync/sync_queue.nim index c28f0466b7..7bf1233ceb 100644 --- a/beacon_chain/sync/sync_queue.nim +++ b/beacon_chain/sync/sync_queue.nim @@ -76,6 +76,8 @@ type SyncManagerError* = object of CatchableError BeaconBlocksRes* = NetRes[seq[ForkedSignedBeaconBlock]] +chronicles.formatIt SyncQueueKind: $it + proc getShortMap*[T](req: SyncRequest[T], data: openArray[ForkedSignedBeaconBlock]): string = ## Returns all slot numbers in ``data`` as placement map. @@ -303,6 +305,10 @@ proc wakeupAndWaitWaiters[T](sq: SyncQueue[T]) {.async.} = sq.wakeupWaiters(true) discard await waitChanges +proc clearAndWakeup*[T](sq: SyncQueue[T]) = + sq.pending.clear() + sq.wakeupWaiters(true) + proc resetWait*[T](sq: SyncQueue[T], toSlot: Option[Slot]) {.async.} = ## Perform reset of all the blocked waiters in SyncQueue. ## @@ -409,7 +415,8 @@ proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot, finalized_slot = safeSlot, fail_slot = failSlot, finalized_epoch = finalizedEpoch, fail_epoch = failEpoch, rewind_epoch_count = rewind.epochCount, - finalized_epoch = finalizedEpoch + finalized_epoch = finalizedEpoch, direction = sq.kind, + topics = "syncman" 0'u64 else: # `MissingParent` happened at different slot so we going to rewind for @@ -419,7 +426,8 @@ proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot, finalized_slot = safeSlot, fail_slot = failSlot, finalized_epoch = finalizedEpoch, fail_epoch = failEpoch, rewind_epoch_count = rewind.epochCount, - finalized_epoch = finalizedEpoch + finalized_epoch = finalizedEpoch, direction = sq.kind, + topics = "syncman" 0'u64 else: 1'u64 @@ -429,7 +437,8 @@ proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot, warn "Сould not rewind further than the last finalized epoch", finalized_slot = safeSlot, fail_slot = failSlot, finalized_epoch = finalizedEpoch, fail_epoch = failEpoch, - finalized_epoch = finalizedEpoch + finalized_epoch = finalizedEpoch, direction = sq.kind, + topics = "syncman" 0'u64 else: 1'u64 @@ -438,7 +447,8 @@ proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot, warn "Unable to continue syncing, please restart the node", finalized_slot = safeSlot, fail_slot = failSlot, finalized_epoch = finalizedEpoch, fail_epoch = failEpoch, - finalized_epoch = finalizedEpoch + finalized_epoch = finalizedEpoch, direction = sq.kind, + topics = "syncman" # Calculate the rewind epoch, which will be equal to last rewind point or # finalizedEpoch let rewindEpoch = @@ -459,7 +469,8 @@ proc getRewindPoint*[T](sq: SyncQueue[T], failSlot: Slot, # latest stored block. if failSlot == safeSlot: warn "Unable to continue syncing, please restart the node", - safe_slot = safeSlot, fail_slot = failSlot + safe_slot = safeSlot, fail_slot = failSlot, direction = sq.kind, + topics = "syncman" safeSlot iterator blocks*[T](sq: SyncQueue[T], @@ -491,7 +502,7 @@ proc notInRange[T](sq: SyncQueue[T], sr: SyncRequest[T]): bool = of SyncQueueKind.Forward: (sq.queueSize > 0) and (sr.slot != sq.outSlot) of SyncQueueKind.Backward: - (sq.queueSize > 0) and (sr.slot + sr.count != sq.outSlot) + (sq.queueSize > 0) and (sr.slot + sr.count - 1'u64 != sq.outSlot) proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], data: seq[ForkedSignedBeaconBlock], @@ -532,7 +543,7 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], some(sq.readyQueue.pop()) of SyncQueueKind.Backward: let maxSlot = sq.readyQueue[0].request.slot + - sq.readyQueue[0].request.count + (sq.readyQueue[0].request.count - 1'u64) if sq.outSlot != maxSlot: none[SyncResult[T]]() else: @@ -552,7 +563,7 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], blocks_count = len(sq.readyQueue[0].data), output_slot = sq.outSlot, input_slot = sq.inpSlot, peer = sq.readyQueue[0].request.item, rewind_to_slot = rewindSlot, - topics = "syncman" + direction = sq.readyQueue[0].request.kind, topics = "syncman" await sq.resetWait(some(rewindSlot)) break @@ -594,7 +605,7 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], request_step = item.request.step, blocks_map = getShortMap(item.request, item.data), blocks_count = len(item.data), errCode = res.error, - topics = "syncman" + direction = item.request.kind, topics = "syncman" var resetSlot: Option[Slot] @@ -617,7 +628,8 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], finalized_slot = safeSlot, request_slot = req.slot, request_count = req.count, request_step = req.step, blocks_count = len(item.data), - blocks_map = getShortMap(req, item.data), topics = "syncman" + blocks_map = getShortMap(req, item.data), + direction = req.kind, topics = "syncman" resetSlot = some(rewindSlot) req.item.updateScore(PeerScoreMissingBlocks) else: @@ -625,7 +637,8 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], peer = req.item, to_slot = safeSlot, request_slot = req.slot, request_count = req.count, request_step = req.step, blocks_count = len(item.data), - blocks_map = getShortMap(req, item.data), topics = "syncman" + blocks_map = getShortMap(req, item.data), + direction = req.kind, topics = "syncman" req.item.updateScore(PeerScoreBadBlocks) of SyncQueueKind.Backward: if safeSlot > req.slot: @@ -637,7 +650,8 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], finalized_slot = safeSlot, request_slot = req.slot, request_count = req.count, request_step = req.step, blocks_count = len(item.data), - blocks_map = getShortMap(req, item.data), topics = "syncman" + blocks_map = getShortMap(req, item.data), + direction = req.kind, topics = "syncman" resetSlot = some(rewindSlot) req.item.updateScore(PeerScoreMissingBlocks) else: @@ -645,14 +659,16 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], peer = req.item, to_slot = safeSlot, request_slot = req.slot, request_count = req.count, request_step = req.step, blocks_count = len(item.data), - blocks_map = getShortMap(req, item.data), topics = "syncman" + blocks_map = getShortMap(req, item.data), + direction = req.kind, topics = "syncman" req.item.updateScore(PeerScoreBadBlocks) of BlockError.Invalid: let req = item.request warn "Received invalid sequence of blocks", peer = req.item, request_slot = req.slot, request_count = req.count, request_step = req.step, blocks_count = len(item.data), - blocks_map = getShortMap(req, item.data), topics = "syncman" + blocks_map = getShortMap(req, item.data), + direction = req.kind, topics = "syncman" req.item.updateScore(PeerScoreBadBlocks) of BlockError.Duplicate, BlockError.UnviableFork: raiseAssert "Handled above" @@ -667,11 +683,11 @@ proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T], queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot, rewind_epoch_count = sq.rewind.get().epochCount, rewind_fail_slot = sq.rewind.get().failSlot, - reset_slot = resetSlot, topics = "syncman" + reset_slot = resetSlot, direction = sq.kind, topics = "syncman" of SyncQueueKind.Backward: debug "Rewind to slot was happened", reset_slot = reset_slot.get(), queue_input_slot = sq.inpSlot, queue_output_slot = sq.outSlot, - reset_slot = resetSlot, topics = "syncman" + reset_slot = resetSlot, direction = sq.kind, topics = "syncman" break proc push*[T](sq: SyncQueue[T], sr: SyncRequest[T]) = @@ -717,11 +733,11 @@ proc pop*[T](sq: SyncQueue[T], maxslot: Slot, item: T): SyncRequest[T] = of SyncQueueKind.Backward: if sq.inpSlot == 0xFFFF_FFFF_FFFF_FFFF'u64: return SyncRequest.empty(sq.kind, T) - if sq.inpSlot <= sq.finalSlot: + if sq.inpSlot < sq.finalSlot: return SyncRequest.empty(sq.kind, T) let (slot, count) = block: - let baseSlot = sq.inpSlot + let baseSlot = sq.inpSlot + 1'u64 if baseSlot - sq.finalSlot < sq.chunkSize: let count = uint64(baseSlot - sq.finalSlot) (baseSlot - count, count) From 34d19b1f27c0691e78b2452030e4a8e8026970fe Mon Sep 17 00:00:00 2001 From: cheatfate Date: Thu, 20 Jan 2022 01:30:24 +0200 Subject: [PATCH 3/4] Recover progress pivot. --- beacon_chain/nimbus_beacon_node.nim | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/beacon_chain/nimbus_beacon_node.nim b/beacon_chain/nimbus_beacon_node.nim index f5c3ab53f7..fd059619fb 100644 --- a/beacon_chain/nimbus_beacon_node.nim +++ b/beacon_chain/nimbus_beacon_node.nim @@ -467,12 +467,13 @@ proc init*(T: type BeaconNode, validatorPool, syncCommitteeMsgPool, quarantine, rng, getBeaconTime, taskpool) syncManager = newSyncManager[Peer, PeerID]( - network.peerPool, SyncQueueKind.Forward, getLocalHeadSlot, getLocalWallSlot, - getFirstSlotAtFinalizedEpoch, getBackfillSlot, blockVerifier) + network.peerPool, SyncQueueKind.Forward, getLocalHeadSlot, + getLocalWallSlot, getFirstSlotAtFinalizedEpoch, getBackfillSlot, + dag.tail.slot, blockVerifier) backfiller = newSyncManager[Peer, PeerID]( - network.peerPool, SyncQueueKind.Backward, getLocalHeadSlot, getLocalWallSlot, - getFirstSlotAtFinalizedEpoch, getBackfillSlot, blockVerifier, - maxHeadAge = 0) + network.peerPool, SyncQueueKind.Backward, getLocalHeadSlot, + getLocalWallSlot, getFirstSlotAtFinalizedEpoch, getBackfillSlot, + blockVerifier, dag.tail.slot, maxHeadAge = 0) let stateTtlCache = if config.restCacheSize > 0: StateTtlCache.init( From 59aab003c71df3f6780728f6b63550b68330cc5b Mon Sep 17 00:00:00 2001 From: cheatfate Date: Thu, 20 Jan 2022 07:48:00 +0200 Subject: [PATCH 4/4] Fix proper pivot place. --- beacon_chain/nimbus_beacon_node.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_chain/nimbus_beacon_node.nim b/beacon_chain/nimbus_beacon_node.nim index fd059619fb..9ad2babdb4 100644 --- a/beacon_chain/nimbus_beacon_node.nim +++ b/beacon_chain/nimbus_beacon_node.nim @@ -473,7 +473,7 @@ proc init*(T: type BeaconNode, backfiller = newSyncManager[Peer, PeerID]( network.peerPool, SyncQueueKind.Backward, getLocalHeadSlot, getLocalWallSlot, getFirstSlotAtFinalizedEpoch, getBackfillSlot, - blockVerifier, dag.tail.slot, maxHeadAge = 0) + dag.backfill.slot, blockVerifier, maxHeadAge = 0) let stateTtlCache = if config.restCacheSize > 0: StateTtlCache.init(