Skip to content

Commit 46636ba

Browse files
committed
be gentler with terminating tasks which fail to reach STANDBY
If a task fails to report the STANDBY state upon startup, we now perform the standard TERM, INT, KILL sequence instead of just KILL. We also rely on Wait() to close the relevant pipes and provide us with the exit code. All of the above is moved into a dedicated method, which will be reused in upcoming fixes.
1 parent 5d2bb0b commit 46636ba

1 file changed

Lines changed: 47 additions & 17 deletions

File tree

executor/executable/controllabletask.go

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -211,25 +211,14 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
211211

212212
err = t.pollTaskForStandbyState()
213213
if err != nil {
214-
t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error())
215-
216-
_ = t.rpc.Close()
217-
t.rpc = nil
218-
219-
pid := t.knownPid
220-
if pid == 0 {
221-
// The pid was never known through a successful `GetState` in the lifetime
222-
// of this process, so we must rely on the PGID of the containing shell
223-
pid = -taskCmd.Process.Pid
224-
}
225214
log.WithFields(defaultLogFields).
226-
Debug("sending SIGKILL (9) to task")
227-
_ = syscall.Kill(pid, syscall.SIGKILL) // fixme: not sure why we do it differently than elsewhere (doTermIntKill)
228-
_ = stdoutIn.Close()
229-
_ = stderrIn.Close()
215+
WithField(infologger.Level, infologger.IL_Support).
216+
WithError(err).
217+
Error("failed to poll task for standby state")
230218

231-
log.WithFields(defaultLogFields).
232-
Debug("task killed")
219+
t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error())
220+
221+
t.cleanupFailedTask(taskCmd)
233222
return
234223
}
235224

@@ -314,6 +303,47 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time.
314303
return
315304
}
316305

306+
func (t *ControllableTask) cleanupFailedTask(taskCmd *exec.Cmd) {
307+
308+
defaultLogFields := logrus.Fields{
309+
"taskId": t.ti.TaskID.GetValue(),
310+
"taskName": t.ti.Name,
311+
"partition": t.knownEnvironmentId.String(),
312+
"detector": t.knownDetector,
313+
}
314+
315+
if taskCmd.Process == nil {
316+
// task never started or was already terminated
317+
return
318+
}
319+
320+
if t.rpc != nil {
321+
_ = t.rpc.Close()
322+
t.rpc = nil
323+
}
324+
325+
pid := t.knownPid
326+
if pid == 0 {
327+
// The pid was never known through a successful `GetState` in the lifetime
328+
// of this process, so we must rely on the PGID of the containing shell
329+
pid = -taskCmd.Process.Pid
330+
}
331+
332+
_ = t.doTermIntKill(-taskCmd.Process.Pid)
333+
334+
err := taskCmd.Wait()
335+
if err != nil {
336+
log.WithFields(defaultLogFields).
337+
WithField(infologger.Level, infologger.IL_Support).
338+
WithError(err).
339+
Warning("task terminated and exited with error")
340+
} else {
341+
log.WithFields(defaultLogFields).
342+
WithField(infologger.Level, infologger.IL_Support).
343+
Debug("task terminated")
344+
}
345+
}
346+
317347
func (t *ControllableTask) initTaskStdLogging(stdoutIn io.ReadCloser, stderrIn io.ReadCloser) {
318348
defaultLogFields := logrus.Fields{
319349
"taskId": t.ti.TaskID.GetValue(),

0 commit comments

Comments
 (0)