Skip to content

Commit c03ae0f

Browse files
authored
Bug fix for leaked LCOW resources; expand logs (#2655)
* Bug fix for leaked LCOW resources; expand logs Incorrect resource closures did not cause errors in non-virtual pod scenarios since the uVM was closed after sandbox exit, so leaked resources were released then. Fix bug where LCOW adapter removal incorrectly used the endpoint ID as the namespace ID. Don't attempt to remove virtual pod endpoints before the host removes adapters from the host. Unmount and delete virtual pod directories and mounts. Add more logs to networking and resource code paths to help debug leaked vPCI endpoint devices. Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com> * PR: spelling, error handling Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com> --------- Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>
1 parent dca9708 commit c03ae0f

13 files changed

Lines changed: 372 additions & 153 deletions

File tree

Protobuild.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ generators = ["go", "go-grpc"]
66
before = ["./protobuf"]
77

88
# defaults are "/usr/local/include" and "/usr/include", which don't exist on Windows.
9-
# override defaults to supress errors about non-existant directories.
9+
# override defaults to supress errors about non-existent directories.
1010
after = []
1111

1212
# This section maps protobuf imports to Go packages.

cmd/containerd-shim-runhcs-v1/task_hcs.go

Lines changed: 54 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -188,9 +188,11 @@ func newHcsTask(
188188
ownsParent bool,
189189
req *task.CreateTaskRequest,
190190
s *specs.Spec) (_ shimTask, err error) {
191-
log.G(ctx).WithFields(logrus.Fields{
192-
"tid": req.ID,
193-
"ownsParent": ownsParent,
191+
ctx, entry := log.SetEntry(ctx, logrus.Fields{logfields.TaskID: req.ID})
192+
entry.WithFields(logrus.Fields{
193+
"hasHost": parent != nil,
194+
"isVirtualPod": s.Annotations != nil && s.Annotations[annotations.VirtualPodID] != "",
195+
"ownsHost": ownsParent,
194196
}).Debug("newHcsTask")
195197

196198
owner := filepath.Base(os.Args[0])
@@ -495,22 +497,30 @@ func (ht *hcsTask) DeleteExec(ctx context.Context, eid string) (int, uint32, tim
495497
}
496498

497499
if eid == "" {
500+
entry := log.G(ctx).WithFields(logrus.Fields{
501+
logfields.TaskID: ht.id,
502+
logfields.ExecID: eid,
503+
})
504+
498505
// We are killing the init task, so we expect the container to be
499506
// stopped after this.
500507
//
501508
// The task process may have already exited, and the status set to
502509
// shimExecStateExited, but resources may still be in the process
503510
// of being cleaned up. Wait for ht.closed to be closed. This signals
504-
// that waitInitExit() has finished destroying container resources,
505-
// and layers were umounted.
511+
// that [hcsTask.waitInitExit]/[hcsTask.waitForHostExit] has finished destroying
512+
// container resources and that layers were umounted.
506513
// If the shim exits before resources are cleaned up, those resources
507514
// will remain locked and untracked, which leads to lingering sandboxes
508515
// and container resources like base vhdx.
516+
const timeout = 30 * time.Second
517+
entry.WithField(logfields.Timeout, timeout).Trace("waiting for task to be closed")
509518
select {
510-
case <-time.After(30 * time.Second):
511-
log.G(ctx).Error("timed out waiting for resource cleanup")
519+
case <-time.After(timeout):
520+
entry.WithField(logfields.Timeout, timeout).Error("timed out waiting for task to close while deleting init exec")
512521
return 0, 0, time.Time{}, errors.Wrap(hcs.ErrTimeout, "waiting for container resource cleanup")
513522
case <-ht.closed:
523+
entry.Trace("received task close signal")
514524
}
515525

516526
// The init task has now exited. A ForceExit() has already been sent to
@@ -608,7 +618,10 @@ func (ht *hcsTask) Wait() *task.StateResponse {
608618
func (ht *hcsTask) waitInitExit() {
609619
ctx, span := oc.StartSpan(context.Background(), "hcsTask::waitInitExit")
610620
defer span.End()
611-
span.AddAttributes(trace.StringAttribute("tid", ht.id))
621+
span.AddAttributes(
622+
trace.StringAttribute("tid", ht.id),
623+
trace.BoolAttribute("host", ht.host != nil),
624+
trace.BoolAttribute("ownsHost", ht.ownsHost))
612625

613626
// Wait for it to exit on its own
614627
ht.init.Wait()
@@ -627,7 +640,10 @@ func (ht *hcsTask) waitInitExit() {
627640
func (ht *hcsTask) waitForHostExit() {
628641
ctx, span := oc.StartSpan(context.Background(), "hcsTask::waitForHostExit")
629642
defer span.End()
630-
span.AddAttributes(trace.StringAttribute("tid", ht.id))
643+
span.AddAttributes(
644+
trace.StringAttribute("tid", ht.id),
645+
trace.BoolAttribute("host", ht.host != nil),
646+
trace.BoolAttribute("ownsHost", ht.ownsHost))
631647

632648
err := ht.host.WaitCtx(ctx)
633649
if err != nil {
@@ -655,13 +671,16 @@ func (ht *hcsTask) waitForHostExit() {
655671
// == nil`.
656672
func (ht *hcsTask) close(ctx context.Context) {
657673
ht.closeOnce.Do(func() {
658-
log.G(ctx).Debug("hcsTask::closeOnce")
674+
entry := log.G(ctx)
675+
entry.Debug("hcsTask::closeOnce")
659676

660677
// ht.c should never be nil for a real task but in testing we stub
661678
// this to avoid a nil dereference. We really should introduce a
662679
// method or interface for ht.c operations that we can stub for
663680
// testing.
664681
if ht.c != nil {
682+
const tearDownTimeout = 30 * time.Second
683+
665684
// Do our best attempt to tear down the container.
666685
// TODO: unify timeout select statements and use [ht.c.WaitCtx] and [context.WithTimeout]
667686
var werr error
@@ -670,52 +689,63 @@ func (ht *hcsTask) close(ctx context.Context) {
670689
werr = ht.c.Wait()
671690
close(ch)
672691
}()
692+
673693
err := ht.c.Shutdown(ctx)
674694
if err != nil {
675-
log.G(ctx).WithError(err).Error("failed to shutdown container")
695+
entry.WithError(err).Error("failed to shutdown container")
676696
} else {
677-
t := time.NewTimer(time.Second * 30)
697+
t := time.NewTimer(tearDownTimeout)
678698
select {
679699
case <-ch:
680700
err = werr
681701
t.Stop()
682702
if err != nil {
683-
log.G(ctx).WithError(err).Error("failed to wait for container shutdown")
703+
entry.WithError(err).Error("failed to wait for container shutdown")
684704
}
685705
case <-t.C:
686706
err = hcs.ErrTimeout
687-
log.G(ctx).WithError(err).Error("failed to wait for container shutdown")
707+
entry.WithFields(logrus.Fields{
708+
logfields.Timeout: tearDownTimeout,
709+
logrus.ErrorKey: err,
710+
}).Error("timed out while waiting for container shutdown")
688711
}
689712
}
690713

691714
if err != nil {
692715
err = ht.c.Terminate(ctx)
693716
if err != nil {
694-
log.G(ctx).WithError(err).Error("failed to terminate container")
717+
entry.WithError(err).Error("failed to terminate container")
695718
} else {
696-
t := time.NewTimer(time.Second * 30)
719+
t := time.NewTimer(tearDownTimeout)
697720
select {
698721
case <-ch:
699722
err = werr
700723
t.Stop()
701724
if err != nil {
702-
log.G(ctx).WithError(err).Error("failed to wait for container terminate")
725+
entry.WithError(err).Error("failed to wait for container terminate")
703726
}
704727
case <-t.C:
705-
log.G(ctx).WithError(hcs.ErrTimeout).Error("failed to wait for container terminate")
728+
entry.WithFields(logrus.Fields{
729+
logfields.Timeout: tearDownTimeout,
730+
logrus.ErrorKey: hcs.ErrTimeout,
731+
}).Error("timed out while waiting for container terminate")
706732
}
707733
}
708734
}
709735

710736
// Release any resources associated with the container.
737+
entry.Trace("starting task resource cleanup")
738+
711739
if err := resources.ReleaseResources(ctx, ht.cr, ht.host, true); err != nil {
712-
log.G(ctx).WithError(err).Error("failed to release container resources")
740+
entry.WithError(err).Error("failed to release container resources")
713741
}
714742

715743
// Close the container handle invalidating all future access.
716744
if err := ht.c.Close(); err != nil {
717-
log.G(ctx).WithError(err).Error("failed to close container")
745+
entry.WithError(err).Error("failed to close container")
718746
}
747+
748+
entry.Trace("task resource cleanup completed")
719749
}
720750
ht.closeHost(ctx)
721751
})
@@ -730,11 +760,12 @@ func (ht *hcsTask) close(ctx context.Context) {
730760
// This call is idempotent and safe to call multiple times.
731761
func (ht *hcsTask) closeHost(ctx context.Context) {
732762
ht.closeHostOnce.Do(func() {
733-
log.G(ctx).Debug("hcsTask::closeHostOnce")
763+
entry := log.G(ctx)
764+
entry.Debug("hcsTask::closeHostOnce")
734765

735766
if ht.ownsHost && ht.host != nil {
736767
if err := ht.host.Close(); err != nil {
737-
log.G(ctx).WithError(err).Error("failed host vm shutdown")
768+
entry.WithError(err).Error("failed host vm shutdown")
738769
}
739770
}
740771
// Send the `init` exec exit notification always.
@@ -750,7 +781,7 @@ func (ht *hcsTask) closeHost(ctx context.Context) {
750781
ExitStatus: exit.ExitStatus,
751782
ExitedAt: exit.ExitedAt,
752783
}); err != nil {
753-
log.G(ctx).WithError(err).Error("failed to publish TaskExitEventTopic")
784+
entry.WithError(err).Error("failed to publish TaskExitEventTopic")
754785
}
755786
close(ht.closed)
756787
})

internal/guest/network/netns.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ func NetNSConfig(ctx context.Context, ifStr string, nsPid int, adapter *guestres
181181
addrsStr = append(addrsStr, fmt.Sprintf("%v", addr))
182182
}
183183

184-
entry.WithField("addresses", addrsStr).Debugf("%v: %s[idx=%d,type=%s] is %v",
184+
entry.WithField("addresses", strings.Join(addrsStr, "; ")).Debugf("%v: %s[idx=%d,type=%s] is %v",
185185
curNS, attr.Name, attr.Index, link.Type(), attr.OperState)
186186
}
187187

@@ -209,7 +209,7 @@ func configureLink(ctx context.Context,
209209
addr.IP = ip
210210
log.G(ctx).WithFields(logrus.Fields{
211211
"allocatedIP": ip,
212-
"IP": addr,
212+
"IP": addr.String(),
213213
}).Debugf("parsed ip address %s/%d", ipConfig.IPAddress, ipConfig.PrefixLength)
214214
ipAddr := &netlink.Addr{IPNet: addr, Label: ""}
215215
if err := netlinkAddrAdd(link, ipAddr); err != nil {
@@ -237,7 +237,7 @@ func configureLink(ctx context.Context,
237237
}
238238

239239
for _, r := range adapter.Routes {
240-
log.G(ctx).WithField("route", r).Debugf("adding a route to interface %s", link.Attrs().Name)
240+
log.G(ctx).WithField("route", log.Format(ctx, r)).Debugf("adding a route to interface %s", link.Attrs().Name)
241241

242242
if (r.DestinationPrefix == ipv4GwDestination || r.DestinationPrefix == ipv6GwDestination) &&
243243
(r.NextHop == ipv4EmptyGw || r.NextHop == ipv6EmptyGw) {

0 commit comments

Comments
 (0)