Skip to content

Commit 49a4f08

Browse files
shihuili1218yuanyuan.liu
andauthored
fix: flaky ElectSelfPersistOrderTest (#1262)
* fix: flaky ElectSelfPersistOrderTest * fmt * setMaxElectionDelayMs * data race * data race * waitForLeader --------- Co-authored-by: yuanyuan.liu <yuanyuan.liu@signalplus.com>
1 parent 913a85a commit 49a4f08

1 file changed

Lines changed: 50 additions & 19 deletions

File tree

jraft-core/src/test/java/com/alipay/sofa/jraft/core/ElectSelfPersistOrderTest.java

Lines changed: 50 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,26 @@ public void teardown() throws Exception {
8383

8484
/**
8585
* Verify that when setTermAndVotedFor() fails during electSelf(), the node
86-
* steps down and does NOT become leader.
86+
* steps down and does NOT send RequestVote RPCs to peers.
8787
*
8888
* Approach:
8989
* 1. Start a 3-node cluster with a FailingMetaStorage on node 0
90-
* 2. Wait for leader election to complete normally (node 0 must not be leader)
91-
* 3. Enable the failure flag on node 0's meta storage
92-
* 4. Directly trigger electSelf() on node 0 via tryElectSelf()
93-
* 5. Verify: persist failure detected, node 0 did NOT become leader
90+
* 2. Wait for leader election to complete normally (fail flag is off)
91+
* 3. Ensure node 0 is NOT the leader
92+
* 4. Enable the failure flag on node 0's meta storage
93+
* 5. Directly trigger electSelf() on node 0 via tryElectSelf()
94+
* 6. electSelf() increments term, calls setTermAndVotedFor() which fails,
95+
* then steps down WITHOUT sending any RequestVote RPCs
96+
* 7. Verify the observer node's term did NOT advance (proving no RPC was sent)
9497
*
95-
* Using tryElectSelf() directly avoids the need to kill the leader (whose
96-
* graceful shutdown sends TimeoutNow, independently bumping follower terms).
98+
* Using tryElectSelf() instead of killing the leader avoids two timing races:
99+
* (a) Leader shutdown sends TimeoutNow to a follower via stepDown(wakeupCandidate=true),
100+
* which triggers immediate electSelf() on the receiver — bypassing preVote entirely.
101+
* If TimeoutNow reaches the observer (~50% chance), the observer bumps its term
102+
* independently, causing a false test failure.
103+
* (b) After the leader dies the observer's own election timer (~10s) may fire at roughly
104+
* the same time node 0's preVote finally gets approved (also ~10s, once the leader
105+
* lease expires), creating a narrow but real race window.
97106
*/
98107
@Test
99108
public void testPersistFailurePreventsRpcSend() throws Exception {
@@ -156,25 +165,47 @@ public void onApply(final Iterator iter) {
156165
assumeTrue("Could not transfer leadership away from node0", leader != null && leader != node0);
157166
}
158167

168+
assertNotNull("Should still have a leader", leader);
169+
final PeerId leaderPeer = leader.getLeaderId();
170+
final int leaderIdx = peers.indexOf(leaderPeer);
171+
172+
// Find the observer (not node0, not leader)
173+
Node observer = null;
174+
for (int i = 0; i < nodes.size(); i++) {
175+
if (i != 0 && i != leaderIdx) {
176+
observer = nodes.get(i);
177+
break;
178+
}
179+
}
180+
assertNotNull("Should have an observer", observer);
181+
182+
final long observerTermBefore = ((NodeImpl) observer).getCurrentTerm();
183+
159184
// Enable fail flag: next setTermAndVotedFor on node0 returns false
160185
failFlag.set(true);
161186

162-
// Directly trigger electSelf() on node0. With the fix, electSelf()
163-
// calls setTermAndVotedFor() BEFORE sending RequestVote RPCs. When
164-
// it returns false, electSelf() steps down without sending any RPCs.
165-
// Without the fix, RPCs would be sent first and node0 could become
166-
// leader despite the persistence failure (since quorum = 2 and the
167-
// observer would grant the vote).
187+
// Directly trigger electSelf() on node0. This is synchronous:
188+
// electSelf() will increment term, call setTermAndVotedFor() which
189+
// fails, then stepDown and return — all without sending RequestVote
190+
// RPCs. No need to kill the leader and wait for timeout-driven elections.
168191
((NodeImpl) node0).tryElectSelf();
169192

170-
// Persist failure should have been detected
171-
assertTrue("Node0 should have hit persist failure", failLatch.await(5, TimeUnit.SECONDS));
172-
assertTrue("Failed term should be positive", failedAtTerm.get() > 0);
193+
// failLatch was counted down synchronously inside setTermAndVotedFor
194+
assertTrue("Node0 should have attempted election and hit persist failure",
195+
failLatch.await(1, TimeUnit.SECONDS));
196+
Thread.sleep(500); // Wait for any async RPCs to be delivered and processed.
197+
final long observerTermAfter = ((NodeImpl) observer).getCurrentTerm();
198+
199+
System.out.println("Observer term before=" + observerTermBefore + ", after=" + observerTermAfter
200+
+ ", node0 failed at term=" + failedAtTerm.get());
173201

174-
System.out.println("Node0 persist failed at term=" + failedAtTerm.get());
202+
// With the fix: setTermAndVotedFor is called BEFORE the RPC loop.
203+
// When it returns false, electSelf() steps down and returns without
204+
// sending any RequestVote RPCs. Therefore the observer's term should
205+
// NOT have been bumped by node0's failed election attempt.
206+
assertEquals("Observer term should not advance (no RequestVote RPC sent)", observerTermBefore,
207+
observerTermAfter);
175208

176-
// Node0 must NOT be leader — proves electSelf() did not proceed
177-
// past the failed setTermAndVotedFor() to send RPCs and win votes.
178209
assertFalse("Node0 should not be leader after persist failure", ((NodeImpl) node0).isLeader());
179210

180211
} finally {

0 commit comments

Comments
 (0)