From 15b3254c74a83dfd5b65185689ae57ac7307cf8c Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 1 May 2026 07:24:55 +0200 Subject: [PATCH 1/5] Revert "ci: drop flaky e2e-multipass job; rely on docker e2e as release gate" This reverts commit b86eacf09cb7faa25d6800c135e7c714c5053e21. --- .github/workflows/ci.yaml | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 330717f..87e388a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -71,18 +71,31 @@ jobs: cache: true - run: go test -tags "e2e,docker" -count=1 -timeout=20m ./e2e/ - # No e2e-multipass job here. The Linux multipass install path is - # snap-only, and snapcraft.io reachability from ubuntu-latest is - # too flaky to gate releases on (multiple consecutive runs on - # 2026-05-01 failed every retry against the snap store). The - # //go:build e2e && multipass test in e2e/multipass_test.go still - # exists and runs locally via: - # - # go test -tags 'e2e,multipass' -count=1 -timeout=30m ./e2e/ - # - # so a developer or self-hosted runner with multipass already - # installed can verify the multipass provisioner end-to-end. - # The e2e (docker) job above is the release/image gate. + e2e-multipass: + # Real Multipass + real k3s + real envoy-gateway rollout, plus + # detect/ctr/crictl/images-load coverage and the pod-to- + # apiserver curl smoke test (regression guard for the + # --node-external-ip class of bug). + # + # Runs on every PR and every push to main / tags. We tried + # label-gating on PRs to save ~5 minutes of VM-boot cost, but + # the asymmetry (label on PR; unconditional on main) meant a + # regression that didn't carry the label landed cleanly and + # then turned main red. Symmetric gating avoids that: if the + # leg is the gate, both sides go through it. 
+ runs-on: ubuntu-latest + needs: test + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + cache: true + - run: sudo snap install multipass + - run: | + export KUBECONFIG="$RUNNER_TEMP/kubeconfig" + touch "$KUBECONFIG" + go test -tags "e2e,multipass" -count=1 -timeout=30m ./e2e/ generate-drift: # `go generate` regenerates the provisioner schema files from From b1628a3a24ac8dcda51db3dfbc72fb026fc97875 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 30 Apr 2026 17:07:38 +0000 Subject: [PATCH 2/5] ci(e2e-multipass): chmod 666 the daemon socket after snap install The first post-merge run failed in ~10 s with "multipass socket access denied" on the first `multipass info` call. The snap- installed daemon socket lands at /var/snap/multipass/common/multipass_socket as 0660 root:multipass-admin and the GHA `runner` user isn't in that group; `multipass version` happens to work without the socket so CheckPrerequisites passes and the failure surfaces later in Provision. chmod 666 the socket after install. Crude but the standard CI crutch -- runner is ephemeral, socket isn't shared. The proper alternative (usermod -aG multipass-admin $USER) doesn't take effect mid-shell and would require `sg multipass-admin -c '...'` wrapping the test invocation, which is more invasive than the chmod. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 87e388a..a7b16ff 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -92,6 +92,15 @@ jobs: go-version-file: go.mod cache: true - run: sudo snap install multipass + # The multipass daemon socket lands as 0660 root:multipass-admin + # and the GHA `runner` user isn't in that group. 
`multipass + # version` doesn't hit the socket so CheckPrerequisites passes, + # but the first call that does (`multipass info`) errors with + # "multipass socket access denied" and the test fails in ~10 s. + # chmod 666 is the standard CI crutch -- the runner is + # ephemeral, the socket isn't shared. usermod -aG would need + # `sg multipass-admin -c ...` wrapping every subsequent step. + - run: sudo chmod 666 /var/snap/multipass/common/multipass_socket - run: | export KUBECONFIG="$RUNNER_TEMP/kubeconfig" touch "$KUBECONFIG" From be9220df4b8aaa7d2bb45339c747268a1ec135fc Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 30 Apr 2026 17:24:11 +0000 Subject: [PATCH 3/5] fix(provision/multipass): write cloud-init seed under $HOME, not /tmp The snap-installed multipass daemon on Linux runs in a confined sandbox. /tmp is not visible inside that sandbox; $HOME is, via the auto-connected `home` interface. The previous code wrote the seed via os.CreateTemp("") which lands in /tmp on Linux, so `multipass launch --cloud-init <path>` errored with: Could not load cloud-init configuration: bad file: /tmp/... Please ensure that Multipass can read it. Write to ~/.cache/y-cluster-multipass/ instead. Works on: - snap-confined multipass on Linux (sandbox sees $HOME) - .pkg-installed multipass on macOS (no confinement) - any future packaging Same temp-file naming scheme via os.CreateTemp(dir, prefix), same cleanup-on-defer contract. Surfaced by the e2e-multipass GHA leg with the daemon socket chmod fix in place: that ran multipass launch for the first time and hit this on every invocation, including local snap installs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/provision/multipass/multipass.go | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/pkg/provision/multipass/multipass.go b/pkg/provision/multipass/multipass.go index ee6e366..4a80eea 100644 --- a/pkg/provision/multipass/multipass.go +++ b/pkg/provision/multipass/multipass.go @@ -17,6 +17,7 @@ import ( "fmt" "io" "os" + "path/filepath" "time" "go.uber.org/zap" @@ -245,12 +246,26 @@ func (c *Cluster) NodeExec(ctx context.Context, command string, stdin io.Reader) // before Provision has resolved it. func (c *Cluster) VMIP() string { return c.vmIP } -// writeCloudInitSeed renders a minimal cloud-config file in -// os.TempDir and returns its path along with a cleanup function. -// multipass reads it via `--cloud-init` once at launch; we don't -// keep it on disk afterwards. No SSH key plumbing -- `multipass -// exec` runs as root over the daemon's IPC channel. +// writeCloudInitSeed renders a minimal cloud-config file under +// ~/.cache/y-cluster-multipass/ and returns its path along with a +// cleanup function. multipass reads it via `--cloud-init` once at +// launch; we don't keep it on disk afterwards. No SSH key plumbing +// -- `multipass exec` runs as root over the daemon's IPC channel. +// +// The path matters: on Linux, multipass ships as a snap and the +// daemon runs confined. /tmp is not visible inside the snap +// sandbox; $HOME is, via the auto-connected `home` interface. +// Writing to a $HOME-rooted cache dir works on snap-confined Linux, +// macOS (.pkg install, no confinement), and any future packaging. 
func (c *Cluster) writeCloudInitSeed() (string, func(), error) { + home, err := os.UserHomeDir() + if err != nil { + return "", func() {}, fmt.Errorf("resolve home dir: %w", err) + } + dir := filepath.Join(home, ".cache", "y-cluster-multipass") + if err := os.MkdirAll(dir, 0o755); err != nil { + return "", func() {}, fmt.Errorf("create %s: %w", dir, err) + } body := fmt.Sprintf(`#cloud-config hostname: %s users: @@ -259,7 +274,7 @@ users: shell: /bin/bash package_update: false `, c.cfg.Name) - f, err := os.CreateTemp("", c.cfg.Name+"-cloud-init-*.yaml") + f, err := os.CreateTemp(dir, c.cfg.Name+"-cloud-init-*.yaml") if err != nil { return "", func() {}, err } From b407525e379a5f2b373be6e1044f0286629d7210 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 1 May 2026 07:01:30 +0200 Subject: [PATCH 4/5] ci(e2e-multipass): retry snap install through transient store outages snapcraft.io has occasional reachability blips that surface as "unable to contact snap store" and fail the install after ~40 s. The workflow run on b27c6d1 hit this exact flake before any test ran. Bounded retry (4 attempts, 30 s gap) keeps the job green through transient outages without masking a real install failure. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yaml | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a7b16ff..1829ef4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -91,7 +91,28 @@ jobs: with: go-version-file: go.mod cache: true - - run: sudo snap install multipass + - name: Install multipass (with snap-store retry) + # snapcraft.io has occasional outages that surface as "unable + # to contact snap store" and fail the install after ~40 s. 
A + # bounded retry loop keeps the job green through transient + # outages without masking a real install failure (after + # MAX_ATTEMPTS the job still fails, with the last error + # visible in the log). + run: | + set -e + MAX_ATTEMPTS=4 + for i in $(seq 1 $MAX_ATTEMPTS); do + if sudo snap install multipass; then + echo "multipass installed on attempt $i" + exit 0 + fi + if [ "$i" -lt "$MAX_ATTEMPTS" ]; then + echo "snap install attempt $i failed; retrying in 30s" + sleep 30 + fi + done + echo "snap install failed after $MAX_ATTEMPTS attempts" >&2 + exit 1 # The multipass daemon socket lands as 0660 root:multipass-admin # and the GHA `runner` user isn't in that group. `multipass # version` doesn't hit the socket so CheckPrerequisites passes, From 858313605b1f49db328564db5a050beedd9174cf Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 1 May 2026 06:54:48 +0200 Subject: [PATCH 5/5] fix(provision/multipass): pipe cloud-init via stdin The snap-packaged multipass on Linux runs the daemon under AppArmor confinement. Its auto-connected `home` interface grants read access to $HOME/* but not to hidden directories: ~/.cache/y-cluster-multipass/ fails with "bad file: ..." at multipass launch. /tmp is also private to the snap, so a path-based seed has no portable home. Pipe cloud-init via `--cloud-init -` instead. multipass launch reads the YAML body from stdin; no path, no temp file, no confinement. Same shape on macOS where the daemon is unconfined, so this is a net simplification. Drops writeCloudInitSeed and its on-disk plumbing (no temp dir, no cleanup callback). Local e2e on macOS still green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/provision/multipass/multipass.go | 67 ++++++++-------------------- 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/pkg/provision/multipass/multipass.go b/pkg/provision/multipass/multipass.go index 4a80eea..dd12619 100644 --- a/pkg/provision/multipass/multipass.go +++ b/pkg/provision/multipass/multipass.go @@ -16,8 +16,7 @@ import ( "errors" "fmt" "io" - "os" - "path/filepath" + "strings" "time" "go.uber.org/zap" @@ -132,13 +131,7 @@ func Provision(ctx context.Context, cfg Config, logger *zap.Logger) (*Cluster, e c := &Cluster{cfg: cfg, logger: logger, Kubeconfig: kubecfg} - seedPath, cleanupSeed, err := c.writeCloudInitSeed() - if err != nil { - return nil, fmt.Errorf("write cloud-init seed: %w", err) - } - defer cleanupSeed() - - if err := c.launch(ctx, seedPath); err != nil { + if err := c.launch(ctx); err != nil { return nil, err } @@ -246,27 +239,11 @@ func (c *Cluster) NodeExec(ctx context.Context, command string, stdin io.Reader) // before Provision has resolved it. func (c *Cluster) VMIP() string { return c.vmIP } -// writeCloudInitSeed renders a minimal cloud-config file under -// ~/.cache/y-cluster-multipass/ and returns its path along with a -// cleanup function. multipass reads it via `--cloud-init` once at -// launch; we don't keep it on disk afterwards. No SSH key plumbing -// -- `multipass exec` runs as root over the daemon's IPC channel. -// -// The path matters: on Linux, multipass ships as a snap and the -// daemon runs confined. /tmp is not visible inside the snap -// sandbox; $HOME is, via the auto-connected `home` interface. -// Writing to a $HOME-rooted cache dir works on snap-confined Linux, -// macOS (.pkg install, no confinement), and any future packaging. 
-func (c *Cluster) writeCloudInitSeed() (string, func(), error) { - home, err := os.UserHomeDir() - if err != nil { - return "", func() {}, fmt.Errorf("resolve home dir: %w", err) - } - dir := filepath.Join(home, ".cache", "y-cluster-multipass") - if err := os.MkdirAll(dir, 0o755); err != nil { - return "", func() {}, fmt.Errorf("create %s: %w", dir, err) - } - body := fmt.Sprintf(`#cloud-config +// cloudInitBody is the minimal cloud-config we hand to +// `multipass launch`. No SSH key plumbing -- `multipass exec` +// runs as root over the daemon's IPC channel. +func (c *Cluster) cloudInitBody() string { + return fmt.Sprintf(`#cloud-config hostname: %s users: - name: ystack @@ -274,27 +251,21 @@ users: shell: /bin/bash package_update: false `, c.cfg.Name) - f, err := os.CreateTemp(dir, c.cfg.Name+"-cloud-init-*.yaml") - if err != nil { - return "", func() {}, err - } - if _, err := f.WriteString(body); err != nil { - _ = f.Close() - _ = os.Remove(f.Name()) - return "", func() {}, err - } - if err := f.Close(); err != nil { - _ = os.Remove(f.Name()) - return "", func() {}, err - } - path := f.Name() - return path, func() { _ = os.Remove(path) }, nil } // launch invokes `multipass launch` with the configured shape. // Memory takes the qemu/docker convention of plain MB, which we // convert to multipass's `M` form. -func (c *Cluster) launch(ctx context.Context, seedPath string) error { +// +// cloud-init is piped via stdin (`--cloud-init -`) rather than +// referenced as a file path. The snap-packaged multipass on Linux +// runs the daemon under AppArmor confinement: the auto-connected +// `home` interface grants the daemon access to `$HOME/*` but not +// to hidden dotfiles or directories, and `/tmp` is private to the +// snap. Avoiding the path entirely sidesteps the whole class of +// confinement issues and works identically on macOS where there is +// no confinement. 
+func (c *Cluster) launch(ctx context.Context) error { c.logger.Info("launching multipass VM", zap.String("name", c.cfg.Name), zap.String("image", c.cfg.Image), @@ -308,10 +279,10 @@ func (c *Cluster) launch(ctx context.Context, seedPath string) error { "--cpus", c.cfg.CPUs, "--memory", c.cfg.Memory + "M", "--disk", diskSize, - "--cloud-init", seedPath, + "--cloud-init", "-", c.cfg.Image, } - out, err := multipassexec.Run(ctx, nil, args...) + out, err := multipassexec.Run(ctx, strings.NewReader(c.cloudInitBody()), args...) if err != nil { return fmt.Errorf("multipass launch: %s: %w", out, err) }