@@ -83,17 +83,26 @@ public void teardown() throws Exception {
8383
8484 /**
8585 * Verify that when setTermAndVotedFor() fails during electSelf(), the node
86- * steps down and does NOT become leader .
86+ * steps down and does NOT send RequestVote RPCs to peers .
8787 *
8888 * Approach:
8989 * 1. Start a 3-node cluster with a FailingMetaStorage on node 0
90- * 2. Wait for leader election to complete normally (node 0 must not be leader)
91- * 3. Enable the failure flag on node 0's meta storage
92- * 4. Directly trigger electSelf() on node 0 via tryElectSelf()
93- * 5. Verify: persist failure detected, node 0 did NOT become leader
90+ * 2. Wait for leader election to complete normally (fail flag is off)
91+ * 3. Ensure node 0 is NOT the leader
92+ * 4. Enable the failure flag on node 0's meta storage
93+ * 5. Directly trigger electSelf() on node 0 via tryElectSelf()
94+ * 6. electSelf() increments term, calls setTermAndVotedFor() which fails,
95+ * then steps down WITHOUT sending any RequestVote RPCs
96+ * 7. Verify the observer node's term did NOT advance (proving no RPC was sent)
9497 *
95- * Using tryElectSelf() directly avoids the need to kill the leader (whose
96- * graceful shutdown sends TimeoutNow, independently bumping follower terms).
98+ * Using tryElectSelf() instead of killing the leader avoids two timing races:
99+ * (a) Leader shutdown sends TimeoutNow to a follower via stepDown(wakeupCandidate=true),
100+ * which triggers immediate electSelf() on the receiver — bypassing preVote entirely.
101+ * If TimeoutNow reaches the observer (~50% chance), the observer bumps its term
102+ * independently, causing a false test failure.
103+ * (b) After the leader dies the observer's own election timer (~10s) may fire at roughly
104+ * the same time node 0's preVote finally gets approved (also ~10s, once the leader
105+ * lease expires), creating a narrow but real race window.
97106 */
98107 @ Test
99108 public void testPersistFailurePreventsRpcSend () throws Exception {
@@ -156,25 +165,47 @@ public void onApply(final Iterator iter) {
156165 assumeTrue ("Could not transfer leadership away from node0" , leader != null && leader != node0 );
157166 }
158167
168+ assertNotNull ("Should still have a leader" , leader );
169+ final PeerId leaderPeer = leader .getLeaderId ();
170+ final int leaderIdx = peers .indexOf (leaderPeer );
171+
172+ // Find the observer (not node0, not leader)
173+ Node observer = null ;
174+ for (int i = 0 ; i < nodes .size (); i ++) {
175+ if (i != 0 && i != leaderIdx ) {
176+ observer = nodes .get (i );
177+ break ;
178+ }
179+ }
180+ assertNotNull ("Should have an observer" , observer );
181+
182+ final long observerTermBefore = ((NodeImpl ) observer ).getCurrentTerm ();
183+
159184 // Enable fail flag: next setTermAndVotedFor on node0 returns false
160185 failFlag .set (true );
161186
162- // Directly trigger electSelf() on node0. With the fix, electSelf()
163- // calls setTermAndVotedFor() BEFORE sending RequestVote RPCs. When
164- // it returns false, electSelf() steps down without sending any RPCs.
165- // Without the fix, RPCs would be sent first and node0 could become
166- // leader despite the persistence failure (since quorum = 2 and the
167- // observer would grant the vote).
187+ // Directly trigger electSelf() on node0. This is synchronous:
188+ // electSelf() will increment term, call setTermAndVotedFor() which
189+ // fails, then stepDown and return — all without sending RequestVote
190+ // RPCs. No need to kill the leader and wait for timeout-driven elections.
168191 ((NodeImpl ) node0 ).tryElectSelf ();
169192
170- // Persist failure should have been detected
171- assertTrue ("Node0 should have hit persist failure" , failLatch .await (5 , TimeUnit .SECONDS ));
172- assertTrue ("Failed term should be positive" , failedAtTerm .get () > 0 );
193+ // failLatch was counted down synchronously inside setTermAndVotedFor
194+ assertTrue ("Node0 should have attempted election and hit persist failure" ,
195+ failLatch .await (1 , TimeUnit .SECONDS ));
196+ Thread .sleep (500 ); // Wait for any async RPCs to be delivered and processed.
197+ final long observerTermAfter = ((NodeImpl ) observer ).getCurrentTerm ();
198+
199+ System .out .println ("Observer term before=" + observerTermBefore + ", after=" + observerTermAfter
200+ + ", node0 failed at term=" + failedAtTerm .get ());
173201
174- System .out .println ("Node0 persist failed at term=" + failedAtTerm .get ());
202+ // With the fix: setTermAndVotedFor is called BEFORE the RPC loop.
203+ // When it returns false, electSelf() steps down and returns without
204+ // sending any RequestVote RPCs. Therefore the observer's term should
205+ // NOT have been bumped by node0's failed election attempt.
206+ assertEquals ("Observer term should not advance (no RequestVote RPC sent)" , observerTermBefore ,
207+ observerTermAfter );
175208
176- // Node0 must NOT be leader — proves electSelf() did not proceed
177- // past the failed setTermAndVotedFor() to send RPCs and win votes.
178209 assertFalse ("Node0 should not be leader after persist failure" , ((NodeImpl ) node0 ).isLeader ());
179210
180211 } finally {
0 commit comments