Skip to content

Commit 9349cab

Browse files
committed
<fix>[compute]: cancel backup longjobs before migration
When the UI sends backupTaskLongJobUuids in the MigrateVmLongJob jobData, cancel those backup longjobs and wait for the volume chain tasks to exit before starting the migration. Includes element-level validation for the deserialized UUID list. Resolves: ZSTAC-82195 Change-Id: If4203d967b23568e7cd09fb1ecc95ae653e137d9
1 parent 327b1cc commit 9349cab

1 file changed

Lines changed: 150 additions & 7 deletions

File tree

compute/src/main/java/org/zstack/compute/vm/MigrateVmLongJob.java

Lines changed: 150 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,34 @@
11
package org.zstack.compute.vm;
22

3+
import org.apache.logging.log4j.LogManager;
4+
import org.apache.logging.log4j.Logger;
35
import org.apache.logging.log4j.ThreadContext;
46
import org.springframework.beans.factory.annotation.Autowire;
57
import org.springframework.beans.factory.annotation.Autowired;
68
import org.springframework.beans.factory.annotation.Configurable;
7-
import org.zstack.core.Platform;
89
import org.zstack.core.cloudbus.CloudBus;
910
import org.zstack.core.cloudbus.CloudBusCallBack;
1011
import org.zstack.core.db.DatabaseFacade;
12+
import org.zstack.core.db.Q;
13+
import org.zstack.core.thread.ThreadFacade;
1114
import org.zstack.header.Constants;
1215
import org.zstack.header.core.Completion;
1316
import org.zstack.header.core.ReturnValueCompletion;
14-
import org.zstack.header.longjob.LongJobErrors;
15-
import org.zstack.header.longjob.LongJobFor;
16-
import org.zstack.header.longjob.LongJobVO;
17+
import org.zstack.header.errorcode.ErrorCode;
18+
import org.zstack.header.longjob.*;
1719
import org.zstack.header.message.APIEvent;
1820
import org.zstack.header.message.MessageReply;
1921
import org.zstack.header.vm.*;
20-
import org.zstack.header.longjob.LongJob;
21-
import org.zstack.longjob.LongJobUtils;
22+
import org.zstack.header.volume.GetVolumeTaskMsg;
23+
import org.zstack.header.volume.GetVolumeTaskReply;
24+
import org.zstack.header.volume.VolumeConstant;
25+
import org.zstack.header.volume.VolumeVO;
26+
import org.zstack.header.volume.VolumeVO_;
2227
import org.zstack.utils.gson.JSONObjectUtil;
2328

24-
import static org.zstack.core.Platform.err;
29+
import java.util.*;
30+
import java.util.concurrent.TimeUnit;
31+
2532
import static org.zstack.core.Platform.operr;
2633

2734

@@ -31,16 +38,152 @@
3138
@LongJobFor(APIMigrateVmMsg.class)
3239
@Configurable(preConstruction = true, autowire = Autowire.BY_TYPE)
3340
public class MigrateVmLongJob implements LongJob {
41+
private static final Logger logger = LogManager.getLogger(MigrateVmLongJob.class);
42+
private static final int WAIT_CHAIN_TASK_EXIT_MAX_RETRIES = 30;
43+
private static final long WAIT_CHAIN_TASK_EXIT_INTERVAL_SECS = 1;
44+
3445
@Autowired
3546
protected CloudBus bus;
3647
@Autowired
3748
protected DatabaseFacade dbf;
49+
@Autowired
50+
private ThreadFacade thdf;
3851

3952
protected String auditResourceUuid;
4053

4154
@Override
4255
public void start(LongJobVO job, ReturnValueCompletion<APIEvent> completion) {
4356
MigrateVmInnerMsg msg = JSONObjectUtil.toObject(job.getJobData(), MigrateVmInnerMsg.class);
57+
58+
List<String> backupTaskLongJobUuids = getBackupTaskLongJobUuids(job.getJobData());
59+
if (backupTaskLongJobUuids != null && !backupTaskLongJobUuids.isEmpty()) {
60+
logger.info(String.format("migrate vm[uuid:%s] longjob has %d backup longjobs to cancel first",
61+
msg.getVmInstanceUuid(), backupTaskLongJobUuids.size()));
62+
cancelBackupLongJobsThenMigrate(backupTaskLongJobUuids, msg, completion);
63+
} else {
64+
doMigrate(msg, completion);
65+
}
66+
}
67+
68+
private List<String> getBackupTaskLongJobUuids(String jobData) {
69+
Map<String, Object> raw = JSONObjectUtil.toObject(jobData, LinkedHashMap.class);
70+
Object uuids = raw == null ? null : raw.get("backupTaskLongJobUuids");
71+
if (!(uuids instanceof List<?>)) {
72+
return null;
73+
}
74+
75+
List<String> result = new ArrayList<>();
76+
for (Object item : (List<?>) uuids) {
77+
if (item == null) {
78+
continue;
79+
}
80+
String uuid = String.valueOf(item).trim();
81+
if (!uuid.isEmpty()) {
82+
result.add(uuid);
83+
}
84+
}
85+
return result.isEmpty() ? null : result;
86+
}
87+
88+
private void cancelBackupLongJobsThenMigrate(List<String> backupTaskLongJobUuids,
89+
MigrateVmInnerMsg msg,
90+
ReturnValueCompletion<APIEvent> completion) {
91+
cancelBackupLongJobs(backupTaskLongJobUuids.iterator(), new Completion(completion) {
92+
@Override
93+
public void success() {
94+
waitForVolumeChainTasksExit(msg.getVmInstanceUuid(), WAIT_CHAIN_TASK_EXIT_MAX_RETRIES,
95+
new Completion(completion) {
96+
@Override
97+
public void success() {
98+
doMigrate(msg, completion);
99+
}
100+
101+
@Override
102+
public void fail(ErrorCode errorCode) {
103+
completion.fail(errorCode);
104+
}
105+
});
106+
}
107+
108+
@Override
109+
public void fail(ErrorCode errorCode) {
110+
logger.warn(String.format("failed to cancel backup longjobs for vm[uuid:%s], " +
111+
"attempting migration anyway: %s", msg.getVmInstanceUuid(), errorCode));
112+
doMigrate(msg, completion);
113+
}
114+
});
115+
}
116+
117+
private void cancelBackupLongJobs(Iterator<String> it, Completion completion) {
118+
if (!it.hasNext()) {
119+
completion.success();
120+
return;
121+
}
122+
123+
String longJobUuid = it.next();
124+
CancelLongJobMsg cmsg = new CancelLongJobMsg();
125+
cmsg.setUuid(longJobUuid);
126+
bus.makeLocalServiceId(cmsg, LongJobConstants.SERVICE_ID);
127+
bus.send(cmsg, new CloudBusCallBack(completion) {
128+
@Override
129+
public void run(MessageReply reply) {
130+
if (!reply.isSuccess()) {
131+
logger.warn(String.format("failed to cancel backup longjob[uuid:%s]: %s",
132+
longJobUuid, reply.getError()));
133+
}
134+
cancelBackupLongJobs(it, completion);
135+
}
136+
});
137+
}
138+
139+
private void waitForVolumeChainTasksExit(String vmUuid, int retriesLeft, Completion completion) {
140+
List<String> volUuids = Q.New(VolumeVO.class)
141+
.eq(VolumeVO_.vmInstanceUuid, vmUuid)
142+
.select(VolumeVO_.uuid)
143+
.listValues();
144+
145+
if (volUuids.isEmpty()) {
146+
completion.success();
147+
return;
148+
}
149+
150+
GetVolumeTaskMsg gmsg = new GetVolumeTaskMsg();
151+
gmsg.setVolumeUuids(volUuids);
152+
bus.makeLocalServiceId(gmsg, VolumeConstant.SERVICE_ID);
153+
bus.send(gmsg, new CloudBusCallBack(completion) {
154+
@Override
155+
public void run(MessageReply reply) {
156+
if (!reply.isSuccess()) {
157+
completion.fail(reply.getError());
158+
return;
159+
}
160+
161+
GetVolumeTaskReply gr = reply.castReply();
162+
boolean hasRunningTasks = gr.getResults().values().stream()
163+
.anyMatch(info -> !info.getRunningTask().isEmpty());
164+
165+
if (!hasRunningTasks) {
166+
completion.success();
167+
return;
168+
}
169+
170+
if (retriesLeft <= 0) {
171+
completion.fail(operr(
172+
"timeout waiting for volume backup chain tasks to exit for vm[uuid:%s]", vmUuid));
173+
return;
174+
}
175+
176+
logger.debug(String.format(
177+
"volumes of vm[uuid:%s] still have running tasks, retry in %ds (retries left: %d)",
178+
vmUuid, WAIT_CHAIN_TASK_EXIT_INTERVAL_SECS, retriesLeft));
179+
thdf.submitTimeoutTask(
180+
() -> waitForVolumeChainTasksExit(vmUuid, retriesLeft - 1, completion),
181+
TimeUnit.SECONDS, WAIT_CHAIN_TASK_EXIT_INTERVAL_SECS);
182+
}
183+
});
184+
}
185+
186+
private void doMigrate(MigrateVmInnerMsg msg, ReturnValueCompletion<APIEvent> completion) {
44187
bus.makeTargetServiceIdByResourceUuid(msg, VmInstanceConstant.SERVICE_ID, msg.getVmInstanceUuid());
45188
bus.send(msg, new CloudBusCallBack(completion) {
46189
@Override

0 commit comments

Comments
 (0)