etcd-io · ahrtr · Jan 10, 2024 · Nov 17, 2023 · Nov 17, 2023 · pav-kv
diff --git a/raft.go b/raft.go
@@ -1478,10 +1478,13 @@ func stepLeader(r *raft, m pb.Message) error {
 				switch {
 				case pr.State == tracker.StateProbe:
 					pr.BecomeReplicate()
-				case pr.State == tracker.StateSnapshot && pr.Match >= pr.PendingSnapshot:
-					// TODO(tbg): we should also enter this branch if a snapshot is
-					// received that is below pr.PendingSnapshot but which makes it
-					// possible to use the log again.
+				case pr.State == tracker.StateSnapshot && pr.Match+1 >= r.raftLog.firstIndex():
+					// Note that we don't take into account PendingSnapshot to
+					// enter this branch. No matter at which index a snapshot
+					// was actually applied, as long as this allows catching up
+					// the follower from the log, we will accept it. This gives
+					// systems more flexibility in how they implement snapshots;
+					// see the comments on PendingSnapshot.
 					r.logger.Debugf("%x recovered from needing snapshot, resumed sending replication messages to %x [%s]", r.id, m.From, pr)
 					// Transition back to replicating state via probing state
 					// (which takes the snapshot into account). If we didn't
@@ -1560,10 +1563,6 @@ func stepLeader(r *raft, m pb.Message) error {
 		if pr.State != tracker.StateSnapshot {
 			return nil
 		}
-		// TODO(tbg): this code is very similar to the snapshot handling in
-		// MsgAppResp above. In fact, the code there is more correct than the
-		// code here and should likely be updated to match (or even better, the
-		// logic pulled into a newly created Progress state machine handler).
 		if !m.Reject {
 			pr.BecomeProbe()
 			r.logger.Debugf("%x snapshot succeeded, resumed sending replication messages to %x [%s]", r.id, m.From, pr)

diff --git a/testdata/snapshot_succeed_via_app_resp_behind.txt b/testdata/snapshot_succeed_via_app_resp_behind.txt
@@ -1,8 +1,5 @@
 # This is a variant of snapshot_succeed_via_app_resp in which the snapshot
 # that is being sent is behind the PendingSnapshot index tracked by the leader.
-#
-# In this case, the leader keeps the follower in StateSnapshot even though it
-# could move it back to StateReplicate.
 
 # Turn off output during the setup of the test.
 log-level none
@@ -151,11 +148,16 @@ stabilize 1
 ----
 > 1 receiving messages
   3->1 MsgAppResp Term:1 Log:0/11
+  DEBUG 1 recovered from needing snapshot, resumed sending replication messages to 3 [StateSnapshot match=11 next=12 paused pendingSnap=12]
+> 1 handling Ready
+  Ready MustSync=false:
+  Messages:
+  1->3 MsgApp Term:1 Log:1/11 Commit:12 Entries:[1/12 EntryNormal "\"foo\""]
 
-# 3 is still pending a snapshot, even though its match is 11 which
-# the leader could catch the follower up from.
+# 3 is in StateReplicate thanks to receiving the snapshot at index 11.
+# This is despite its PendingSnapshot having been 12.
 status 1
 ----
 1: StateReplicate match=12 next=13
 2: StateReplicate match=12 next=13
-3: StateSnapshot match=11 next=12 paused pendingSnap=12
+3: StateReplicate match=11 next=13 inflight=1
diff --git a/tracker/progress.go b/tracker/progress.go
@@ -42,11 +42,27 @@ type Progress struct {
 	// before and stops sending any replication message.
 	State StateType
 
-	// PendingSnapshot is used in StateSnapshot.
-	// If there is a pending snapshot, the pendingSnapshot will be set to the
-	// index of the snapshot. If pendingSnapshot is set, the replication process of
-	// this Progress will be paused. raft will not resend snapshot until the pending one
-	// is reported to be failed.
+	// PendingSnapshot is used in StateSnapshot and tracks the last index of the
+	// leader at the time at which it realized a snapshot was necessary. This
+	// matches the index in the MsgSnap message emitted from raft.
+	//
+	// While there is a pending snapshot, replication to the follower is paused.
+	// The follower will transition back to StateReplicate if the leader
+	// receives an MsgAppResp from it that reconnects the follower to the
+	// leader's log (such an MsgAppResp is emitted when the follower applies a
+	// snapshot). It may be surprising that PendingSnapshot is not taken into
+	// account here, but consider that complex systems may delegate the sending
+	// of snapshots to alternative datasources (i.e. not the leader). In such
+	// setups, it is difficult to manufacture a snapshot at a particular index
+	// requested by raft and the actual index may be ahead or behind. This
+	// should be okay, as long as the snapshot allows replication to resume.
+	//
+	// The follower will transition to StateProbe if ReportSnapshot is called on
+	// the leader; if SnapshotFinish is passed then PendingSnapshot becomes the
+	// basis for the next attempt to append. In practice, the first mechanism is
+	// the one that is relevant in most cases. However, if this MsgAppResp is
+	// lost (fallible network) then the second mechanism ensures that in this
+	// case the follower does not erroneously remain in StateSnapshot.
 	PendingSnapshot uint64
 
 	// RecentActive is true if the progress is recently active. Receiving any messages