Skip to content

Commit 8c8ed73

Browse files
committed
<fix>[ha]: defer skip-trace list cleanup on MN departure to prevent split-brain
When a management node departs, its VM skip-trace entries were immediately removed. If VMs were still being started by kvmagent, the next VM sync would falsely detect them as Stopped and trigger HA, causing split-brain. Fix: transfer departed MN skip-trace entries to an orphaned set with 10-minute TTL instead of immediate deletion. VMs in the orphaned set remain skip-traced until the TTL expires or they are explicitly continued, preventing false HA triggers during MN restart scenarios. Resolves: ZSTAC-80821 Change-Id: I3222e260b2d7b33dc43aba0431ce59a788566b34
1 parent 799a84f commit 8c8ed73

1 file changed

Lines changed: 62 additions & 2 deletions

File tree

plugin/kvm/src/main/java/org/zstack/kvm/KvmVmSyncPingTask.java

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,12 @@ public class KvmVmSyncPingTask extends VmTracer implements KVMPingAgentNoFailure
6969
private List<Class> skipVmTracerReplies = new ArrayList<>();
7070
private Map<String, Integer> vmInShutdownMap = new ConcurrentHashMap<>();
7171

72+
// Orphaned skip entries from departed MN nodes. Key=vmUuid, Value=timestamp when orphaned.
73+
// These VMs remain in skip-trace state for ORPHAN_TTL_MS to avoid false HA triggers
74+
// when a MN restarts and its in-flight VM operations haven't completed yet. See ZSTAC-80821.
75+
private final ConcurrentHashMap<String, Long> orphanedSkipVms = new ConcurrentHashMap<>();
76+
private static final long ORPHAN_TTL_MS = 10 * 60 * 1000; // 10 minutes
77+
7278
{
7379
getReflections().getTypesAnnotatedWith(SkipVmTracer.class).forEach(clz -> {
7480
skipVmTracerMessages.add(clz.asSubclass(Message.class));
@@ -196,8 +202,13 @@ private void syncVm(final HostInventory host, final Completion completion) {
196202
// Get vms to skip before send command to host to confirm the vm will be skipped after sync command finished.
197203
// The problem is if one vm-sync skipped operation is started and finished during vm sync command's handling
198204
// vm state would still be sync to mn
205+
// ZSTAC-80821: clean up expired orphaned entries each sync cycle
206+
cleanupExpiredOrphanedSkipVms();
207+
199208
Set<String> vmsToSkipSetHostSide = new HashSet<>();
200209
vmsToSkip.values().forEach(vmsToSkipSetHostSide::addAll);
210+
// ZSTAC-80821: also skip VMs from departed MN nodes that are still within TTL
211+
vmsToSkipSetHostSide.addAll(orphanedSkipVms.keySet());
201212

202213
// if the vm is not running on host when sync command executing but started as soon as possible
203214
// before response handling of vm sync, mgmtSideStates will including the running vm but not result in
@@ -228,6 +239,8 @@ public void run(MessageReply reply) {
228239

229240
// Get vms to skip after sync result returned.
230241
vmsToSkip.values().forEach(vmsToSkipSetHostSide::addAll);
242+
// ZSTAC-80821: include orphaned entries from departed MN nodes
243+
vmsToSkipSetHostSide.addAll(orphanedSkipVms.keySet());
231244

232245
Collection<String> vmUuidsInDeleteVmGC = DeleteVmGC.queryVmInGC(host.getUuid(), ret.getStates().keySet());
233246

@@ -446,7 +459,19 @@ public void nodeJoin(ManagementNodeInventory inv) {
446459
@Override
447460
public void nodeLeft(ManagementNodeInventory inv) {
448461
vmApis.remove(inv.getUuid());
449-
vmsToSkip.remove(inv.getUuid());
462+
463+
// ZSTAC-80821: Instead of immediately removing skip list entries, move them
464+
// to the orphaned set with a TTL. This prevents false HA triggers for VMs that
465+
// are still being started by kvmagent but whose controlling MN has restarted.
466+
Set<String> skippedVms = vmsToSkip.remove(inv.getUuid());
467+
if (skippedVms != null && !skippedVms.isEmpty()) {
468+
long now = System.currentTimeMillis();
469+
for (String vmUuid : skippedVms) {
470+
orphanedSkipVms.put(vmUuid, now);
471+
logger.info(String.format("moved VM[uuid:%s] from departed MN[uuid:%s] skip list to orphaned set" +
472+
" (will expire in %d minutes)", vmUuid, inv.getUuid(), ORPHAN_TTL_MS / 60000));
473+
}
474+
}
450475
}
451476

452477
@Override
@@ -460,6 +485,41 @@ public void iJoin(ManagementNodeInventory inv) {
460485
}
461486

462487
public boolean isVmDoNotNeedToTrace(String vmUuid) {
463-
return vmsToSkip.values().stream().anyMatch(vmsToSkipSet -> vmsToSkipSet.contains(vmUuid));
488+
if (vmsToSkip.values().stream().anyMatch(vmsToSkipSet -> vmsToSkipSet.contains(vmUuid))) {
489+
return true;
490+
}
491+
492+
// ZSTAC-80821: Also check orphaned skip entries from departed MN nodes
493+
Long orphanedAt = orphanedSkipVms.get(vmUuid);
494+
if (orphanedAt != null) {
495+
if (System.currentTimeMillis() - orphanedAt < ORPHAN_TTL_MS) {
496+
logger.debug(String.format("VM[uuid:%s] is in orphaned skip set, skipping trace", vmUuid));
497+
return true;
498+
} else {
499+
// Expired, clean up
500+
orphanedSkipVms.remove(vmUuid);
501+
logger.info(String.format("orphaned skip entry for VM[uuid:%s] expired after %d minutes, resuming trace",
502+
vmUuid, ORPHAN_TTL_MS / 60000));
503+
}
504+
}
505+
506+
return false;
507+
}
508+
509+
// Periodically clean up expired orphaned entries. Called from VM sync cycle.
510+
private void cleanupExpiredOrphanedSkipVms() {
511+
if (orphanedSkipVms.isEmpty()) {
512+
return;
513+
}
514+
515+
long now = System.currentTimeMillis();
516+
Iterator<Map.Entry<String, Long>> it = orphanedSkipVms.entrySet().iterator();
517+
while (it.hasNext()) {
518+
Map.Entry<String, Long> entry = it.next();
519+
if (now - entry.getValue() >= ORPHAN_TTL_MS) {
520+
it.remove();
521+
logger.info(String.format("cleaned up expired orphaned skip entry for VM[uuid:%s]", entry.getKey()));
522+
}
523+
}
464524
}
465525
}

0 commit comments

Comments
 (0)