diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 330717f..1829ef4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -71,18 +71,61 @@ jobs: cache: true - run: go test -tags "e2e,docker" -count=1 -timeout=20m ./e2e/ - # No e2e-multipass job here. The Linux multipass install path is - # snap-only, and snapcraft.io reachability from ubuntu-latest is - # too flaky to gate releases on (multiple consecutive runs on - # 2026-05-01 failed every retry against the snap store). The - # //go:build e2e && multipass test in e2e/multipass_test.go still - # exists and runs locally via: - # - # go test -tags 'e2e,multipass' -count=1 -timeout=30m ./e2e/ - # - # so a developer or self-hosted runner with multipass already - # installed can verify the multipass provisioner end-to-end. - # The e2e (docker) job above is the release/image gate. + e2e-multipass: + # Real Multipass + real k3s + real envoy-gateway rollout, plus + # detect/ctr/crictl/images-load coverage and the pod-to- + # apiserver curl smoke test (regression guard for the + # --node-external-ip class of bug). + # + # Runs on every PR and every push to main / tags. We tried + # label-gating on PRs to save ~5 minutes of VM-boot cost, but + # the asymmetry (label on PR; unconditional on main) meant a + # regression that didn't carry the label landed cleanly and + # then turned main red. Symmetric gating avoids that: if the + # leg is the gate, both sides go through it. + runs-on: ubuntu-latest + needs: test + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + cache: true + - name: Install multipass (with snap-store retry) + # snapcraft.io has occasional outages that surface as "unable + # to contact snap store" and fail the install after ~40 s. 
A + # bounded retry loop keeps the job green through transient + # outages without masking a real install failure (after + # MAX_ATTEMPTS the job still fails, with the last error + # visible in the log). + run: | + set -e + MAX_ATTEMPTS=4 + for i in $(seq 1 $MAX_ATTEMPTS); do + if sudo snap install multipass; then + echo "multipass installed on attempt $i" + exit 0 + fi + if [ "$i" -lt "$MAX_ATTEMPTS" ]; then + echo "snap install attempt $i failed; retrying in 30s" + sleep 30 + fi + done + echo "snap install failed after $MAX_ATTEMPTS attempts" >&2 + exit 1 + # The multipass daemon socket lands as 0660 root:multipass-admin + # and the GHA `runner` user isn't in that group. `multipass + # version` doesn't hit the socket so CheckPrerequisites passes, + # but the first call that does (`multipass info`) errors with + # "multipass socket access denied" and the test fails in ~10 s. + # chmod 666 is the standard CI crutch: the runner is ephemeral and + # the socket isn't shared. usermod -aG only affects new sessions, + # so every later step would need an `sg multipass-admin -c ...` wrap. 
+ - run: sudo chmod 666 /var/snap/multipass/common/multipass_socket + - run: | + export KUBECONFIG="$RUNNER_TEMP/kubeconfig" + touch "$KUBECONFIG" + go test -tags "e2e,multipass" -count=1 -timeout=30m ./e2e/ generate-drift: # `go generate` regenerates the provisioner schema files from diff --git a/pkg/provision/multipass/multipass.go b/pkg/provision/multipass/multipass.go index ee6e366..dd12619 100644 --- a/pkg/provision/multipass/multipass.go +++ b/pkg/provision/multipass/multipass.go @@ -16,7 +16,7 @@ import ( "errors" "fmt" "io" - "os" + "strings" "time" "go.uber.org/zap" @@ -131,13 +131,7 @@ func Provision(ctx context.Context, cfg Config, logger *zap.Logger) (*Cluster, e c := &Cluster{cfg: cfg, logger: logger, Kubeconfig: kubecfg} - seedPath, cleanupSeed, err := c.writeCloudInitSeed() - if err != nil { - return nil, fmt.Errorf("write cloud-init seed: %w", err) - } - defer cleanupSeed() - - if err := c.launch(ctx, seedPath); err != nil { + if err := c.launch(ctx); err != nil { return nil, err } @@ -245,13 +239,11 @@ func (c *Cluster) NodeExec(ctx context.Context, command string, stdin io.Reader) // before Provision has resolved it. func (c *Cluster) VMIP() string { return c.vmIP } -// writeCloudInitSeed renders a minimal cloud-config file in -// os.TempDir and returns its path along with a cleanup function. -// multipass reads it via `--cloud-init` once at launch; we don't -// keep it on disk afterwards. No SSH key plumbing -- `multipass -// exec` runs as root over the daemon's IPC channel. -func (c *Cluster) writeCloudInitSeed() (string, func(), error) { - body := fmt.Sprintf(`#cloud-config +// cloudInitBody is the minimal cloud-config we hand to +// `multipass launch`. No SSH key plumbing -- `multipass exec` +// runs as root over the daemon's IPC channel. 
+func (c *Cluster) cloudInitBody() string { + return fmt.Sprintf(`#cloud-config hostname: %s users: - name: ystack @@ -259,27 +251,21 @@ users: shell: /bin/bash package_update: false `, c.cfg.Name) - f, err := os.CreateTemp("", c.cfg.Name+"-cloud-init-*.yaml") - if err != nil { - return "", func() {}, err - } - if _, err := f.WriteString(body); err != nil { - _ = f.Close() - _ = os.Remove(f.Name()) - return "", func() {}, err - } - if err := f.Close(); err != nil { - _ = os.Remove(f.Name()) - return "", func() {}, err - } - path := f.Name() - return path, func() { _ = os.Remove(path) }, nil } // launch invokes `multipass launch` with the configured shape. // Memory takes the qemu/docker convention of plain MB, which we // convert to multipass's `M` form. -func (c *Cluster) launch(ctx context.Context, seedPath string) error { +// +// cloud-init is piped via stdin (`--cloud-init -`) rather than +// referenced as a file path. The snap-packaged multipass on Linux +// runs the daemon under AppArmor confinement: the auto-connected +// `home` interface grants the daemon access to `$HOME/*` but not +// to hidden dotfiles or directories, and `/tmp` is private to the +// snap. Avoiding the path entirely sidesteps the whole class of +// confinement issues and works identically on macOS where there is +// no confinement. +func (c *Cluster) launch(ctx context.Context) error { c.logger.Info("launching multipass VM", zap.String("name", c.cfg.Name), zap.String("image", c.cfg.Image), @@ -293,10 +279,10 @@ func (c *Cluster) launch(ctx context.Context, seedPath string) error { "--cpus", c.cfg.CPUs, "--memory", c.cfg.Memory + "M", "--disk", diskSize, - "--cloud-init", seedPath, + "--cloud-init", "-", c.cfg.Image, } - out, err := multipassexec.Run(ctx, nil, args...) + out, err := multipassexec.Run(ctx, strings.NewReader(c.cloudInitBody()), args...) if err != nil { return fmt.Errorf("multipass launch: %s: %w", out, err) }