From 160e4821bc8610b1d81a2e28786dbce5c87a72d8 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Mon, 4 May 2026 15:20:43 +0000 Subject: [PATCH 01/16] feat(scripts): appliance build / publish / e2e drivers Brings every scripts/ change from agents/appliance-export-import to upstream main as a single bump. The Go and testdata side of this work landed in #19 (appliance-primitives); this commit is the operator-facing bash that drives the released binary through the appliance lifecycle, plus the .env-style config the scripts source. What's new vs main: - appliance-build-hetzner.sh / appliance-build-virtualbox.sh: interactive build flows producing a .qcow2 + a VirtualBox- importable .ova respectively, both via the released y-cluster binary's prepare-export and export subcommands. - appliance-publish-hetzner.sh: pushes a built appliance to Hetzner Object Storage for handoff. - appliance-qemu-to-gcp.sh: end-to-end qemu -> GCP custom image flow (export --format=gcp-tar -> gsutil cp -> compute images create) with persistent /data/yolean disk preserved across redeploys, plus a teardown subcommand. - gcp-bootstrap-credentials.sh: one-shot bootstrap for the service account / project / key file the GCP flow needs. - e2e-appliance-export-import.sh: local qemu -> qemu round- trip exercising the full prepare-export / export / import cycle without any cloud cred dependency. - e2e-appliance-hetzner.{sh,pkr.hcl}: Packer-based snapshot flow; lays the snapshot down once, spins fresh servers on top to verify boot. - e2e-appliance-qemu-to-gcp.sh: non-interactive driver of appliance-qemu-to-gcp.sh end to end, including teardown. - .env.example + .gitignore: documents every overridable knob (GCP_PROJECT, GCP_KEY, H_S3_ENV_FILE, ENV_FILE) with a generic example path; .env stays out of git. Configuration: required values are operator-supplied via env vars (no built-in defaults). Each script derives REPO_ROOT from BASH_SOURCE and sources $REPO_ROOT/.env via `set -o allexport` when present, so the .env path works regardless of CWD (including `cd /tmp && bash /path/to/script`). Missing required values fail fast with a clear "set $VAR in .env or shell env" message. Scope: scripts/ + repo-root .env plumbing. The Go side is already on main via #19. Both `go build ./...` and `go test ./...` are unchanged-clean on this branch -- the scripts add no go.mod or testdata edits. 
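For reference, the shared preamble looks roughly like this (lifted from the
scripts themselves; the required-variable guard varies per script):

    REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
    if [[ -f "$REPO_ROOT/.env" ]]; then
      set -o allexport; . "$REPO_ROOT/.env"; set +o allexport
    fi
    : "${GCP_PROJECT:?set GCP_PROJECT in .env or shell env}"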
Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 35 ++ .gitignore | 3 + scripts/appliance-build-hetzner.sh | 413 +++++++++++++++ scripts/appliance-build-virtualbox.sh | 247 +++++++++ scripts/appliance-publish-hetzner.sh | 312 ++++++++++++ scripts/appliance-qemu-to-gcp.sh | 680 +++++++++++++++++++++++++ scripts/e2e-appliance-export-import.sh | 312 ++++++++++++ scripts/e2e-appliance-hetzner.pkr.hcl | 252 +++++++++ scripts/e2e-appliance-hetzner.sh | 306 +++++++++++ scripts/e2e-appliance-qemu-to-gcp.sh | 382 ++++++++++++++ scripts/gcp-bootstrap-credentials.sh | 148 ++++++ 11 files changed, 3090 insertions(+) create mode 100644 .env.example create mode 100755 scripts/appliance-build-hetzner.sh create mode 100755 scripts/appliance-build-virtualbox.sh create mode 100755 scripts/appliance-publish-hetzner.sh create mode 100755 scripts/appliance-qemu-to-gcp.sh create mode 100755 scripts/e2e-appliance-export-import.sh create mode 100644 scripts/e2e-appliance-hetzner.pkr.hcl create mode 100755 scripts/e2e-appliance-hetzner.sh create mode 100755 scripts/e2e-appliance-qemu-to-gcp.sh create mode 100755 scripts/gcp-bootstrap-credentials.sh diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f687ae0 --- /dev/null +++ b/.env.example @@ -0,0 +1,35 @@ +# Copy to .env (gitignored) and fill in your values: +# cp .env.example .env +# +# Sourced from the repo root by the scripts under scripts/. Each +# script bails with `set GCP_PROJECT in .env` (or similar) if a +# required value is unset. + +# === GCP appliance flow (scripts/appliance-qemu-to-gcp.sh) === + +# GCP project where appliance images, VMs, and the persistent +# data disk live. The bundled service account +# (gcp-bootstrap-credentials.sh) must have roles/owner here. +# Example: my-org-appliance +GCP_PROJECT= + +# Path to the GCP service-account JSON. Created via +# scripts/gcp-bootstrap-credentials.sh on a machine with +# gcloud Owner access; copy the JSON to this path on the +# build machine and chmod 600. +# Example: ~/.config/y-cluster/gcp-appliance.key.json +GCP_KEY= + +# === Hetzner Object Storage publish (scripts/appliance-publish-hetzner.sh) === + +# Path to a file containing HCLOUD_TOKEN and Hetzner Object +# Storage credentials. Format documented in the script's header. +# Example: ~/.config/y-cluster/hetzner.env +H_S3_ENV_FILE= + +# === Hetzner Cloud Packer flow (scripts/e2e-appliance-hetzner.sh) === + +# Same file as H_S3_ENV_FILE when HCLOUD_TOKEN + S3 creds +# co-locate; separate ENV_FILE var historically. +# Example: ~/.config/y-cluster/hetzner.env +ENV_FILE= diff --git a/.gitignore b/.gitignore index 3d0a7ca..e5be3bd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /kustomize-traverse /dist/ + +# operator-local defaults sourced by the appliance scripts +.env diff --git a/scripts/appliance-build-hetzner.sh b/scripts/appliance-build-hetzner.sh new file mode 100755 index 0000000..ee91740 --- /dev/null +++ b/scripts/appliance-build-hetzner.sh @@ -0,0 +1,413 @@ +#!/usr/bin/env bash +# Build a y-cluster appliance interactively: stand up a local +# qemu cluster with the same fixtures we'll ship, give the +# operator a chance to poke at it, then on confirm run a +# Packer-built Hetzner snapshot and provision a server from +# it. Shows ssh + curl details for both stages. +# +# Why two clusters: the local one is for hands-on verification +# (kubectl / ssh / poke). The Hetzner one is the actual handoff. 
+# They're built from the same testdata fixtures, so verifying +# locally proves the fixture set; Packer rebuilds the snapshot +# fresh inside Hetzner. No round-trip artefact transfer between +# the two -- they're independent builds with shared inputs. +# +# Two confirmations: +# 1. "Local cluster looks good -- build Hetzner snapshot?" +# 2. "Snapshot ready -- create server from snapshot?" +# Either prompt aborts non-destructively. Aborting at (1) +# leaves the local cluster up; aborting at (2) leaves the +# Hetzner snapshot in your project for later use. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-build-hetzner.sh - local verify -> confirm -> Packer snapshot -> confirm -> Hetzner server + +Usage: appliance-build-hetzner.sh + +Environment: + ENV_FILE Hetzner credentials file (set in .env or shell env; required) + HCLOUD_TOKEN Hetzner Cloud API token (sourced from ENV_FILE) + NAME Local cluster name (default: appliance-hetzner-build) + APP_HTTP_PORT Local host port -> guest 80 (default: 39080) + APP_API_PORT Local host port -> guest 6443 (default: 39443) + APP_SSH_PORT Local host port -> guest 22 (default: 2229) + SERVER_NAME Hetzner server name (default: y-cluster-appliance) + SERVER_TYPE Hetzner server type (default: cx23) + SERVER_LOCATION Hetzner location (default: hel1) + SNAPSHOT_NAME Packer snapshot description (default: y-cluster-appliance-) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_LOCAL Set to keep the local cluster after Hetzner deploy (default: tear down) + ASSUME_YES Set to skip BOTH confirmations and proceed end-to-end + +Dependencies: + go, qemu-system-x86_64, kubectl, ssh, ssh-keygen, curl, packer, hcloud +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${ENV_FILE:?set ENV_FILE in .env or shell env}" + +NAME="${NAME:-appliance-hetzner-build}" +APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_API_PORT="${APP_API_PORT:-39443}" +APP_SSH_PORT="${APP_SSH_PORT:-2229}" +SERVER_NAME="${SERVER_NAME:-y-cluster-appliance}" +SERVER_TYPE="${SERVER_TYPE:-cx23}" +SERVER_LOCATION="${SERVER_LOCATION:-hel1}" +SNAPSHOT_NAME="${SNAPSHOT_NAME:-y-cluster-appliance-$(date -u +%Y%m%d-%H%M%S)}" + +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" +PACKER_TEMPLATE="$REPO_ROOT/scripts/e2e-appliance-hetzner.pkr.hcl" + +# Keep CFG_DIR stable + outside CACHE_DIR (the cleanup glob in the +# qemu provisioner would otherwise match this directory and rm -f +# would bail, killing the script under set -e). Same convention as +# scripts/appliance-build-virtualbox.sh. +CFG_DIR="${CFG_DIR:-$HOME/.cache/y-cluster-appliance-build/$NAME}" + +# Stable location for the per-deploy ssh key so the operator can +# ssh into the Hetzner server later. Survives across script runs +# unless they delete the file or run with a fresh SERVER_NAME. 
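+# (With the default SERVER_NAME that key lands at
+#  ~/.cache/y-cluster-appliance-build/hetzner-keys/y-cluster-appliance.)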
+HCLOUD_KEY_DIR="$HOME/.cache/y-cluster-appliance-build/hetzner-keys" +HCLOUD_KEY="$HCLOUD_KEY_DIR/$SERVER_NAME" + +stage() { printf '\n=== %s ===\n' "$*"; } + +confirm() { + local prompt=$1 + if [[ -n "${ASSUME_YES:-}" ]]; then + echo "ASSUME_YES set; proceeding ($prompt)" + return 0 + fi + read -r -p "$prompt [y/N] " answer + case "${answer,,}" in + y|yes) return 0 ;; + *) return 1 ;; + esac +} + +cat <<'WARN' + +================================================================ +DEPRECATION WARNING + +scripts/appliance-build-hetzner.sh is on the way out. + +Hetzner Cloud has no public API for uploading custom disk +images, so this script's "build a Hetzner snapshot" stage is +a fresh build inside Hetzner via Packer -- the local-qemu +verification you do first is fixture-equivalence, NOT the same +disk that ships. That mismatches the appliance contract +(local-built disk = disk that boots elsewhere). + +Replacement plan: + - scripts/appliance-qemu-to-gcp.sh (in progress) takes the + appliance contract path: provision local, export disk, + upload to GCP via `gcloud compute images import`, boot a + VM from that uploaded image. Same disk you verified + locally is the disk GCP runs. + - scripts/e2e-appliance-hetzner.sh is being repurposed once + a pkg/provision/hetzner/ provisioner exists; it will then + cover provision-on-Hetzner -> snapshot -> instantiate as + an end-to-end test of that provisioner shape. + +This script still runs. It still produces a working appliance +on Hetzner. But the artefact you ship is built fresh on +Hetzner, not transferred from your local verification. +================================================================ + +WARN +confirm "Proceed with the Hetzner Packer flow anyway?" \ + || { echo "aborted; no changes made."; exit 0; } + + +for tool in go qemu-system-x86_64 kubectl ssh ssh-keygen curl packer hcloud; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +if [[ ! -f "$ENV_FILE" ]]; then + echo "missing env file: $ENV_FILE (need HCLOUD_TOKEN)" >&2 + exit 1 +fi +# shellcheck disable=SC1090 +source "$ENV_FILE" +[[ -n "${HCLOUD_TOKEN:-}" ]] || { echo "HCLOUD_TOKEN not set in $ENV_FILE" >&2; exit 1; } +export HCLOUD_TOKEN + +# === Build dev binary (linux/amd64 because Packer uploads it) === +stage "building linux/amd64 dev binary -> $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && GOOS=linux GOARCH=amd64 go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === Local config === +mkdir -p "$CFG_DIR" +cat > "$CFG_DIR/y-cluster-provision.yaml" <&2 + return 1 +} +probe_local echo "http://127.0.0.1:$APP_HTTP_PORT/q/envoy/echo" +probe_local s3 "http://127.0.0.1:$APP_HTTP_PORT/s3/health" + +cat </dev/null || true # y-script-lint:disable=or-true # cleanup best-effort +fi + +# Pre-render the kustomize bases for Packer (the build VM doesn't +# have y-cluster, so it can't run yconverge; concat both module +# outputs into a single kubectl-applyable file). Same shape as +# scripts/e2e-appliance-hetzner.sh. 
+STATEFUL_MANIFEST=$(mktemp -t appliance-stateful.XXXXXX.yaml) +{ + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/namespace" + echo '---' + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/base" +} > "$STATEFUL_MANIFEST" + +LOCALSTORAGE_MANIFEST=$(mktemp -t y-cluster-localstorage.XXXXXX.yaml) +"$Y_CLUSTER" localstorage render > "$LOCALSTORAGE_MANIFEST" + +trap 'rm -f "$STATEFUL_MANIFEST" "$LOCALSTORAGE_MANIFEST"' EXIT + +stage "packer init" +packer init "$PACKER_TEMPLATE" + +stage "packer build (creates a temporary $SERVER_TYPE in $SERVER_LOCATION, snapshots, deletes)" +packer build \ + -var "snapshot_name=$SNAPSHOT_NAME" \ + -var "server_type=$SERVER_TYPE" \ + -var "location=$SERVER_LOCATION" \ + -var "y_cluster_binary=$Y_CLUSTER" \ + -var "prepare_script=$REPO_ROOT/pkg/provision/qemu/prepare_inguest.sh" \ + -var "stateful_manifest=$STATEFUL_MANIFEST" \ + -var "localstorage_manifest=$LOCALSTORAGE_MANIFEST" \ + "$PACKER_TEMPLATE" + +# Resolve snapshot id from the description we gave Packer. +stage "resolving snapshot id for $SNAPSHOT_NAME" +SNAPSHOT_ID=$(hcloud image list \ + --type=snapshot \ + --selector="purpose=y-cluster-appliance" \ + --output=json \ + | python3 -c " +import json, sys +images = json.load(sys.stdin) +matches = [i for i in images if i.get('description') == '$SNAPSHOT_NAME'] +if not matches: + sys.exit('no snapshot named $SNAPSHOT_NAME found') +print(matches[0]['id']) +") +echo " snapshot id: $SNAPSHOT_ID" + +cat < +================================================================ + +EOF + +confirm "Create Hetzner server from snapshot $SNAPSHOT_ID?" \ + || { echo "aborted; snapshot $SNAPSHOT_ID preserved for later use."; exit 0; } + +# === Stage 3: create server + verify === +mkdir -p "$HCLOUD_KEY_DIR" +chmod 700 "$HCLOUD_KEY_DIR" +if [[ ! -f "$HCLOUD_KEY" ]]; then + ssh-keygen -t ed25519 -N '' -C "$SERVER_NAME-$$" -f "$HCLOUD_KEY" -q +fi +KEY_NAME="$SERVER_NAME" + +stage "tearing down any leftover server / key from a prior run" +hcloud server delete "$SERVER_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing server is not an error +hcloud ssh-key delete "$KEY_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing key is not an error + +stage "registering ssh public key as $KEY_NAME" +hcloud ssh-key create --name "$KEY_NAME" --public-key-from-file "$HCLOUD_KEY.pub" >/dev/null + +stage "creating $SERVER_NAME from snapshot $SNAPSHOT_ID" +hcloud server create \ + --name "$SERVER_NAME" \ + --type "$SERVER_TYPE" \ + --image "$SNAPSHOT_ID" \ + --location "$SERVER_LOCATION" \ + --ssh-key "$KEY_NAME" \ + >/dev/null +PUBLIC_IP=$(hcloud server ip "$SERVER_NAME") +echo " public ip: $PUBLIC_IP" + +# Wait for sshd, then probe the workload endpoints. +SSH_OPTS="-i $HCLOUD_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh on $PUBLIC_IP:22" +for _ in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS root@"$PUBLIC_IP" 'true' 2>/dev/null; then + break + fi + sleep 5 +done + +# Cold boot from snapshot: cloud-init -> k3s.service first start -> +# envoy gateway controller + data plane -> VersityGW StatefulSet +# rebinds its PV -> klipper-lb binds :80. Generous loop. 
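+# (The defaults below work out to 60 attempts x 10s sleep, i.e. up to
+#  ~10 minutes per endpoint before giving up.)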
+probe_remote() { + local what=$1 url=$2 attempts=${3:-60} + local out + out=$(mktemp) + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o "$out" -w " $what HTTP %{http_code}\n" "$url"; then + echo + echo "=== $what response (head) ===" + head -25 "$out" + echo + rm -f "$out" + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 10 + done + rm -f "$out" + return 1 +} + +stage "probing http://$PUBLIC_IP -- echo + s3" +if probe_remote echo "http://$PUBLIC_IP/q/envoy/echo" \ + && probe_remote s3 "http://$PUBLIC_IP/s3/health"; then + cat < k3s-$SERVER_NAME.yaml + KUBECONFIG=k3s-$SERVER_NAME.yaml kubectl get nodes + (k3s's apiserver isn't open to the internet by default; either + add 6443 to the Hetzner firewall, or tunnel via ssh: + ssh -L 6443:127.0.0.1:6443 -N root@$PUBLIC_IP &) + +When you're done: + hcloud server delete $SERVER_NAME + hcloud ssh-key delete $KEY_NAME + hcloud image delete $SNAPSHOT_ID # optional; snapshot is reusable +================================================================ +EOF + exit 0 +fi + +echo >&2 +echo "echo never answered. Server $SERVER_NAME left running for diagnosis:" >&2 +# shellcheck disable=SC2086 +ssh $SSH_OPTS root@"$PUBLIC_IP" 'systemctl is-active k3s; kubectl get pods -A 2>&1 | head -30' >&2 \ + || true # y-script-lint:disable=or-true # diagnostic best-effort +echo " ssh: ssh -i $HCLOUD_KEY root@$PUBLIC_IP" >&2 +echo " destroy: hcloud server delete $SERVER_NAME" >&2 +exit 1 diff --git a/scripts/appliance-build-virtualbox.sh b/scripts/appliance-build-virtualbox.sh new file mode 100755 index 0000000..923aaa6 --- /dev/null +++ b/scripts/appliance-build-virtualbox.sh @@ -0,0 +1,247 @@ +#!/usr/bin/env bash +# Build a y-cluster appliance and pause for hands-on testing +# before exporting a VirtualBox-friendly bundle. +# +# Same provision shape as scripts/e2e-appliance-export-import.sh: +# qemu provider, k3s + Envoy Gateway, echo workload, VersityGW +# StatefulSet (covers stateful PV path). Then it stops, prints +# kubectl + ssh access info, and waits for the operator to +# confirm before running prepare-export + export. +# +# Why interactive: the VirtualBox handoff is precious. We want +# the operator to sanity-check the live cluster before we lock +# the disk for export and (optionally) tear it down. Yes lets +# y-cluster prepare-export + export run; "no" leaves the cluster +# up for further poking (and prints the teardown command). +# +# The bundled VMDK uses subformat=monolithicSparse, which +# imports more cleanly under VirtualBox's "Use Existing Virtual +# Hard Disk File" than the streamOptimized default that ships +# for ESXi. The README inside the bundle documents both. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-build-virtualbox.sh - provision -> install -> pause -> export VirtualBox-friendly VMDK + +Usage: appliance-build-virtualbox.sh [bundle-dir] + +Positional: + bundle-dir Where to write the export bundle. 
Default: + ./dist/appliance-virtualbox/- + +Environment: + NAME Appliance name (default: appliance-virtualbox) + APP_HTTP_PORT Host port -> guest 80 (default: 39080) + APP_API_PORT Host port -> guest 6443 (default: 39443) + APP_SSH_PORT Host port -> guest 22 (default: 2229) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_CLUSTER Set to keep the cluster alive after export (default: tear it down) + SKIP_PROVISION Set to skip provision + install (resume into the prompt against + an already-running cluster of the same NAME) + ASSUME_YES Set to skip the interactive prompt and proceed to export + +Dependencies: + go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl, virt-sysprep +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +NAME="${NAME:-appliance-virtualbox}" +APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_API_PORT="${APP_API_PORT:-39443}" +APP_SSH_PORT="${APP_SSH_PORT:-2229}" + +REPO_ROOT="$(git rev-parse --show-toplevel)" +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" + +DEFAULT_BUNDLE="$REPO_ROOT/dist/appliance-virtualbox/$NAME-$(date -u +%Y%m%dT%H%M%SZ)" +BUNDLE_DIR="${1:-$DEFAULT_BUNDLE}" + +# CFG_DIR lives OUTSIDE $CACHE_DIR on purpose: the cleanup glob +# below ("$CACHE_DIR/$NAME-"*) would otherwise match a config +# directory whose name starts with $NAME, and rm -f bails on +# directories under set -e. Keep it stable (not mktemp -d) so +# SKIP_PROVISION can resume against an existing cluster. +CFG_DIR="${CFG_DIR:-$HOME/.cache/y-cluster-appliance-build/$NAME}" + +stage() { printf '\n=== %s ===\n' "$*"; } + +for tool in go qemu-system-x86_64 qemu-img kubectl ssh ssh-keygen curl virt-sysprep; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +# virt-sysprep needs to read /boot/vmlinuz-* (libguestfs supermin +# builds an appliance VM with the host kernel). Ubuntu installs +# kernel images 0600 root, so non-root invocations bail with an +# opaque "supermin exited with error status 1". Surface the fix. +if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then + cat >&2 < $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === Config (always written; teardown + prepare-export need it) === +mkdir -p "$CFG_DIR" +cat > "$CFG_DIR/y-cluster-provision.yaml" <&2 + return 1 + } + probe echo "http://127.0.0.1:$APP_HTTP_PORT/q/envoy/echo" + probe s3 "http://127.0.0.1:$APP_HTTP_PORT/s3/health" +else + stage "SKIP_PROVISION set; resuming against existing $NAME cluster" +fi + +# === Interactive pause for hands-on testing === +SSH_KEY="$CACHE_DIR/$NAME-ssh" + +cat < Import Appliance wizard accepts +# only OVF / OVA, NOT raw VMDK -- so we ship OVA. The OVF +# carries the CPU/RAM/NIC hints; VirtualBox just needs port +# forwards added post-import. +stage "exporting OVA (VirtualBox-importable) -> $BUNDLE_DIR" +mkdir -p "$(dirname "$BUNDLE_DIR")" +"$Y_CLUSTER" export \ + --context="$NAME" \ + --format=ova \ + "$BUNDLE_DIR" + +ls -la "$BUNDLE_DIR/" +echo +echo " bundled .ova members:" +tar tvf "$BUNDLE_DIR/$NAME.ova" | sed 's/^/ /' + +cat < Import Appliance -> select $BUNDLE_DIR/$NAME.ova + 2. 
Confirm CPU / RAM / disk on the wizard (defaults come + from the OVF: $(awk '/cpus/{print $2}' "$CFG_DIR/y-cluster-provision.yaml") vCPU, $(awk '/memory/{print $2}' "$CFG_DIR/y-cluster-provision.yaml") MiB RAM) + 3. After import: Network -> Adapter 1 -> Advanced -> Port + Forwarding, add: + ssh TCP host 2222 -> guest 22 + http TCP host 8080 -> guest 80 + https TCP host 8443 -> guest 443 + 4. Start. SSH key + access details in $BUNDLE_DIR/README.md +================================================================ +EOF + +if [[ -z "${KEEP_CLUSTER:-}" ]]; then + stage "tearing down build-side cluster (set KEEP_CLUSTER=1 to keep it)" + "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null || true # y-script-lint:disable=or-true # cleanup best-effort +fi diff --git a/scripts/appliance-publish-hetzner.sh b/scripts/appliance-publish-hetzner.sh new file mode 100755 index 0000000..71a85e2 --- /dev/null +++ b/scripts/appliance-publish-hetzner.sh @@ -0,0 +1,312 @@ +#!/usr/bin/env bash +# Idempotently ensure a Hetzner Object Storage bucket exists, +# configured to allow public GET on individual objects but NOT +# bucket listing, then upload a single file and print its public +# URL. +# +# Use case: the operator runs scripts/appliance-build-virtualbox.sh +# to produce a VMDK bundle, then this script to publish the +# bundle (or a tarball of it) at a URL their test host can curl +# while staying anonymous. +# +# Hetzner Object Storage is S3-compatible; we shell out to the +# AWS CLI pointed at https://.your-objectstorage.com. +# If `aws` is not installed locally we run the official image +# via docker, which is universally available on dev machines. +# +# Credentials live in $H_S3_ENV_FILE (set in .env or shell env; +# typically the same file that holds HCLOUD_TOKEN). The file +# should set: +# H_S3_ACCESS_KEY= Object +# Storage -> Credentials> +# H_S3_SECRET_KEY=... +# H_S3_REGION=fsn1 # or hel1 / nbg1 +# H_S3_BUCKET=... # default bucket (script arg overrides) +# +# These are SEPARATE from HCLOUD_TOKEN: Object Storage is +# managed under the same project but the API uses dedicated +# S3 access/secret keys, not the Cloud API token. We co-locate +# them in the same env file because they share a project, not +# because they share an auth scheme. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-publish-hetzner.sh - upload a file to a Hetzner Object Storage bucket with public-read on objects (no listing) + +Usage: appliance-publish-hetzner.sh [object-key] + +Positional: + file Local path to upload + object-key Key to write under in the bucket (default: basename of file) + +Environment: + H_S3_ENV_FILE Path to env file with H_S3_* vars (set in .env or shell env; required) + H_S3_BUCKET Bucket name; overrides the env file. Required if not in env file. + H_S3_REGION Region; overrides the env file (fsn1, hel1, or nbg1). + AWS_CLI How to invoke aws. Default: local `aws` if on PATH, + else `docker run --rm -i public.ecr.aws/aws-cli/aws-cli`. 
+ +Examples: + # publish a fresh appliance bundle + ./scripts/appliance-publish-hetzner.sh \ + dist/appliance-virtualbox/appliance-virtualbox-*/appliance-virtualbox.vmdk + + # publish under a custom key + ./scripts/appliance-publish-hetzner.sh appliance.tar.gz releases/2026-05-01/appliance.tar.gz + +Dependencies: + curl, and one of: locally-installed `aws` (preferred) OR `docker` + (used to invoke public.ecr.aws/aws-cli/aws-cli when aws is missing) +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; + "") echo "$YHELP" >&2; exit 2 ;; +esac + +INPUT="$1" +KEY_OVERRIDE="${2:-}" + +stage() { printf '\n=== %s ===\n' "$*"; } + +if [[ ! -e "$INPUT" ]]; then + echo "path not found: $INPUT" >&2 + exit 1 +fi + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${H_S3_ENV_FILE:?set H_S3_ENV_FILE in .env or shell env}" +ENV_FILE="$H_S3_ENV_FILE" +if [[ -f "$ENV_FILE" ]]; then + # shellcheck disable=SC1090 + set -a; . "$ENV_FILE"; set +a +else + echo "credentials file not found: $ENV_FILE" >&2 + cat >&2 </object-storage/credentials +EOF + exit 1 +fi + +: "${H_S3_ACCESS_KEY:?H_S3_ACCESS_KEY not set in $ENV_FILE}" +: "${H_S3_SECRET_KEY:?H_S3_SECRET_KEY not set in $ENV_FILE}" +: "${H_S3_REGION:?H_S3_REGION not set in $ENV_FILE (fsn1, hel1, or nbg1)}" +: "${H_S3_BUCKET:?H_S3_BUCKET not set; pass via env or env file}" + +BUCKET="$H_S3_BUCKET" +REGION="$H_S3_REGION" +ENDPOINT="https://${REGION}.your-objectstorage.com" + +# === Decide what to upload === +# Two modes: +# bundle - INPUT is a directory that looks like a y-cluster +# bundle (or a file inside one, identified by a +# sibling README.md). We tar `-C parent dirname` so +# the tarball extracts to a sibling directory in the +# customer's CWD: `tar xzf .tgz` produces +# `.//{README.md, *.vmdk, *-ssh, *-ssh.pub}`. +# single - INPUT is a regular file with no bundle context. +# Upload as-is. Key defaults to its basename. +# Bundle mode is preferred whenever a README.md sits next to +# the disk file, so the operator can pass either the directory +# or the .vmdk and get the same bundle-tarball result. +SOURCE_FILE="" +KEY="" +BUNDLE_DIR="" + +if [[ -d "$INPUT" ]]; then + BUNDLE_DIR=$(realpath "$INPUT") +elif [[ -f "$INPUT" && -f "$(dirname "$INPUT")/README.md" ]]; then + BUNDLE_DIR=$(realpath "$(dirname "$INPUT")") +fi + +if [[ -n "$BUNDLE_DIR" ]]; then + bundle_name=$(basename "$BUNDLE_DIR") + bundle_parent=$(dirname "$BUNDLE_DIR") + # Write the tarball next to the bundle dir, NOT under /tmp. + # /tmp is tmpfs on most distros (~16 GB) and a 1.5 GiB + # appliance tarball easily exhausts it; bundle_parent is on + # the operator's chosen output volume where space matches + # the bundle size. + TGZ="$bundle_parent/.${bundle_name}.$$.tgz" + trap 'rm -f "$TGZ"' EXIT + stage "packing bundle $BUNDLE_DIR -> $TGZ" + tar -czf "$TGZ" -C "$bundle_parent" "$bundle_name" + SOURCE_FILE="$TGZ" + KEY="${KEY_OVERRIDE:-${bundle_name}.tgz}" +else + SOURCE_FILE="$INPUT" + KEY="${KEY_OVERRIDE:-$(basename "$INPUT")}" +fi + +PUBLIC_URL="https://${BUCKET}.${REGION}.your-objectstorage.com/${KEY}" + +# === Pick an AWS CLI invocation === +# Prefer a local `aws` to avoid pulling a 200MB image on every +# run; fall back to docker so a fresh dev box doesn't have to +# install awscli first. 
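+# (The aws_s3api / aws_s3 wrappers below expand $AWS_CLI unquoted, so
+#  any multi-word override works too -- e.g. a podman equivalent of the
+#  docker fallback; hypothetical, not exercised by this script.)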
+if [[ -n "${AWS_CLI:-}" ]]; then + : # operator override; trust it verbatim +elif command -v aws >/dev/null; then + AWS_CLI="aws" +elif command -v docker >/dev/null; then + # Mount /tmp because mktemp puts the policy + tarball there; + # mount $HOME so absolute paths under $HOME (typical y-cluster + # cache locations) resolve inside the container; -w $PWD + + # -v $PWD:$PWD lets relative paths the operator typed work. + AWS_CLI="docker run --rm -i \ + -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION \ + -v $HOME:$HOME -v $PWD:$PWD -v /tmp:/tmp -w $PWD \ + public.ecr.aws/aws-cli/aws-cli" +else + echo "neither 'aws' nor 'docker' found; install one or set AWS_CLI" >&2 + exit 1 +fi + +export AWS_ACCESS_KEY_ID="$H_S3_ACCESS_KEY" +export AWS_SECRET_ACCESS_KEY="$H_S3_SECRET_KEY" +export AWS_DEFAULT_REGION="$REGION" + +aws_s3api() { + # shellcheck disable=SC2086 + $AWS_CLI s3api --endpoint-url "$ENDPOINT" "$@" +} +aws_s3() { + # shellcheck disable=SC2086 + $AWS_CLI s3 --endpoint-url "$ENDPOINT" "$@" +} + +# === Ensure bucket exists === +# head-bucket exits 0 if the bucket exists and we have access, +# nonzero with stderr "Not Found" / "Forbidden" otherwise. We +# only auto-create on Not Found; Forbidden means a name clash +# in another tenant and the operator should pick a different +# bucket name. +stage "checking bucket s3://$BUCKET (endpoint: $ENDPOINT)" +head_err=$(mktemp) +trap 'rm -f "$head_err"' EXIT +if aws_s3api head-bucket --bucket "$BUCKET" 2>"$head_err"; then + echo " bucket exists" +else + if grep -qiE '404|Not Found|NoSuchBucket' "$head_err"; then + stage "creating bucket s3://$BUCKET" + # Hetzner rejects LocationConstraint=us-east-1 (the + # AWS-CLI default for create-bucket without + # --create-bucket-configuration). Hetzner-region values + # work as the LocationConstraint. + aws_s3api create-bucket \ + --bucket "$BUCKET" \ + --create-bucket-configuration "LocationConstraint=$REGION" + else + echo "head-bucket failed and not a 404:" >&2 + cat "$head_err" >&2 + exit 1 + fi +fi + +# === Apply public-read-on-objects, no-listing policy === +# This is the "anonymous can curl any individual object whose +# key they already know, but cannot enumerate the bucket" +# pattern. We allow only s3:GetObject on the +# arn:aws:s3:::BUCKET/* resource; ListBucket on the bucket +# itself is omitted, so anonymous LIST is denied. +stage "applying public-read-objects policy" +policy_file=$(mktemp) +trap 'rm -f "$head_err" "$policy_file"' EXIT +cat > "$policy_file" </dev/null || true # y-script-lint:disable=or-true # not all S3-compat backends implement put-public-access-block; policy alone is sufficient on Hetzner + +# === Upload === +stage "uploading $SOURCE_FILE -> s3://$BUCKET/$KEY" +size=$(stat -c '%s' "$SOURCE_FILE") +echo " size: $size bytes ($(numfmt --to=iec-i --suffix=B "$size" 2>/dev/null || echo "$size B"))" + +# `aws s3 cp` handles multipart for >8MB by default and prints +# a progress bar to stderr; preferred over `s3api put-object` +# for arbitrary-sized files (qcow2 / vmdk are easily >5GB). +aws_s3 cp "$SOURCE_FILE" "s3://$BUCKET/$KEY" + +# === Verify the object is anonymously reachable === +# Use a fresh curl with no creds to confirm the policy actually +# took effect; surfaces config drift (e.g. another script +# overwriting the bucket policy) at publish time, not at +# customer-download time. 
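+# (curl here sends no S3 credentials at all, so this is exactly the
+#  request an anonymous customer download makes.)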
+stage "verifying anonymous GET" +http_code=$(curl -sI -o /dev/null -w '%{http_code}' "$PUBLIC_URL") +if [[ "$http_code" != "200" ]]; then + echo "anonymous GET returned HTTP $http_code (expected 200)" >&2 + echo "URL: $PUBLIC_URL" >&2 + exit 1 +fi +echo " anonymous GET HTTP 200" + +# === Verify the bucket is NOT anonymously listable === +list_code=$(curl -sI -o /dev/null -w '%{http_code}' "https://${BUCKET}.${REGION}.your-objectstorage.com/") +case "$list_code" in + 403) echo " anonymous LIST denied (HTTP 403): correct" ;; + 200) echo "WARNING: anonymous LIST returned HTTP 200; the bucket is enumerable. Check the policy." >&2 ;; + *) echo " anonymous LIST returned HTTP $list_code" ;; +esac + +cat < y-cluster prepare-export (virt-sysprep +# identity reset + timesyncd flip + netplan generic match). +# 4. y-cluster export --format=gcp-tar -- packs the qcow2 +# into .tar.gz containing a single disk.raw, the +# shape Compute Engine custom images expect. +# 5. PROMPT 2: confirm before any GCP-side write happens. +# 6. Upload tarball to GCS (creates bucket on first run). +# 7. gcloud compute images create from the GCS object +# (direct, no Cloud Build). +# 8. gcloud compute firewall-rules create (idempotent) for +# tcp:80 + tcp:443 on tagged instances. +# 9. gcloud compute instances create from the new image, +# tagged for the firewall rule. +# 10. Wait for ssh + probe HTTP. Print connection details. +# +# Aborting at PROMPT 1 leaves the local cluster running. +# Aborting at PROMPT 2 leaves the local bundle written but +# nothing in GCP. +# +# Every gcloud invocation passes --project=$GCP_PROJECT +# explicitly. Auth is the service-account JSON pointed at by +# $GOOGLE_APPLICATION_CREDENTIALS (created by +# scripts/gcp-bootstrap-credentials.sh). + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-qemu-to-gcp.sh - local provision -> hands-on -> export -> ship to GCP + +Usage: + appliance-qemu-to-gcp.sh build + ship to GCP + appliance-qemu-to-gcp.sh teardown delete VM + image + GCS object + appliance-qemu-to-gcp.sh teardown --delete-data-disk + also delete the persistent + /data/yolean disk (DESTRUCTIVE) + +Teardown reads GCP_PROJECT / GCP_ZONE / GCP_BUCKET / VM_NAME / +GCP_DATADIR_DISK / NAME from the same env vars as the build +flow. Custom images and GCS objects are deleted by NAME prefix +(so different NAMEs in the same project do not clobber each +other). The persistent data disk, the bucket itself, and the +firewall rule are preserved unless --delete-data-disk is set. +Local cluster cleanup (if KEEP_LOCAL was set) is separate: +y-cluster teardown -c \$CFG_DIR. + +Environment: + GCP_PROJECT GCP project (set in .env or shell env; required) + GCP_REGION GCP region (default: europe-north2 -- Stockholm) + GCP_ZONE GCP zone (default: europe-north2-a) + GCP_BUCKET GCS bucket for image tarballs + (default: -appliance-images) + GCP_MACHINE_TYPE Compute Engine machine type (default: e2-medium) + GCP_IMAGE_FAMILY Image family tag (default: y-cluster-appliance) + GCP_DATADIR_DISK Persistent disk for /data/yolean + (default: appliance-gcp-datadir; preserved on teardown) + GCP_DATADIR_SIZE Persistent disk size (default: 10GB; only used on create) + GCP_KEY Service account JSON (set in .env or shell env; required) + NAME Local cluster name (default: appliance-gcp-build). + Used as the prefix for the deliverable directory. + KUBECTX kubectl context name (default: local). 
Script + bails if a context with this name already + exists in your kubeconfig -- set KUBECTX to + something else, or delete the existing one. + IMAGE_NAME Custom image name in GCE (default: -) + VM_NAME Compute Engine VM name (default: $NAME) + APP_HTTP_PORT Local host port -> guest 80 (default: 39080) + APP_API_PORT Local host port -> guest 6443 (default: 39443) + APP_SSH_PORT Local host port -> guest 22 (default: 2229) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_LOCAL Set to keep the local cluster after upload (default: tear down) + KEEP_BUNDLE Set to keep the local export bundle (default: keep -- bundle path printed) + ASSUME_YES Skip BOTH confirmations and proceed end-to-end + +Dependencies: + go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl, + virt-sysprep, gcloud +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${GCP_PROJECT:?set GCP_PROJECT in .env or shell env}" +: "${GCP_KEY:?set GCP_KEY in .env or shell env}" + +GCP_REGION="${GCP_REGION:-europe-north2}" +GCP_ZONE="${GCP_ZONE:-europe-north2-a}" +GCP_BUCKET="${GCP_BUCKET:-${GCP_PROJECT}-appliance-images}" +GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-medium}" +GCP_IMAGE_FAMILY="${GCP_IMAGE_FAMILY:-y-cluster-appliance}" +GCP_DATADIR_DISK="${GCP_DATADIR_DISK:-appliance-gcp-datadir}" +GCP_DATADIR_SIZE="${GCP_DATADIR_SIZE:-10GB}" + +NAME="${NAME:-appliance-gcp-build}" +KUBECTX="${KUBECTX:-local}" +IMAGE_NAME="${IMAGE_NAME:-${NAME}-$(date -u +%Y%m%d-%H%M%S)}" +VM_NAME="${VM_NAME:-$NAME}" +APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_API_PORT="${APP_API_PORT:-39443}" +APP_SSH_PORT="${APP_SSH_PORT:-2229}" + +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" +CFG_DIR="${CFG_DIR:-$HOME/.cache/y-cluster-appliance-build/$NAME}" +# Top-level deliverable dir. Holds two per-format subdirs -- +# `gcp-tar/` (uploaded to Compute Engine here) and `ova/` +# (handed to a customer for VirtualBox / VMware Import +# Appliance). Both subdirs are byte-equivalent disk states; +# the only differences are the on-the-wire format and the +# README boot instructions. +BUNDLE_DIR="${BUNDLE_DIR:-$REPO_ROOT/dist/appliance/$NAME-$(date -u +%Y%m%dT%H%M%SZ)}" + +stage() { printf '\n=== %s ===\n' "$*"; } +confirm() { + local prompt=$1 + if [[ -n "${ASSUME_YES:-}" ]]; then + echo "ASSUME_YES set; proceeding ($prompt)" + return 0 + fi + read -r -p "$prompt [y/N] " answer + case "${answer,,}" in + y|yes) return 0 ;; + *) return 1 ;; + esac +} + +# do_teardown deletes GCP resources owned by this script's +# NAME prefix in the configured project + zone. Reads the +# same env vars as the build flow so a teardown after a +# customised build (e.g., NAME=customer-foo) cleans up +# exactly that customer's resources without touching other +# NAMEs that share the same project. 
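+# Example (clean up one customer's VM, images and GCS objects only):
+#   NAME=customer-foo ./scripts/appliance-qemu-to-gcp.sh teardown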
+do_teardown() { + local delete_data_disk=0 + while [[ $# -gt 0 ]]; do + case "$1" in + --delete-data-disk) delete_data_disk=1 ;; + *) echo "unknown teardown flag: $1" >&2; exit 2 ;; + esac + shift + done + + stage "inventory in $GCP_PROJECT / $GCP_ZONE" + local vm images objects disk + vm=$(gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(name)' 2>/dev/null) \ + || true # y-script-lint:disable=or-true # missing VM is not an error + images=$(gcloud compute images list \ + --project="$GCP_PROJECT" \ + --no-standard-images \ + --filter="name~^${NAME}-" \ + --format='value(name)' 2>/dev/null) \ + || true # y-script-lint:disable=or-true # empty list is not an error + objects=$(gcloud storage ls "gs://$GCP_BUCKET/${NAME}-*.tar.gz" \ + --project="$GCP_PROJECT" 2>/dev/null) \ + || true # y-script-lint:disable=or-true # missing bucket / no objects is not an error + disk=$(gcloud compute disks describe "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(name)' 2>/dev/null) \ + || true # y-script-lint:disable=or-true # missing disk is not an error + + echo + echo "Will DELETE:" + [[ -n "$vm" ]] && echo " VM: $VM_NAME ($GCP_ZONE)" + if [[ -n "$images" ]]; then + echo "$images" | sed 's/^/ Image: /' + fi + if [[ -n "$objects" ]]; then + echo "$objects" | sed 's|^| GCS object: |' + fi + if [[ $delete_data_disk -eq 1 && -n "$disk" ]]; then + echo " Data disk: $GCP_DATADIR_DISK (PERSISTENT DATA WILL BE LOST)" + fi + echo + echo "Will PRESERVE:" + if [[ $delete_data_disk -eq 0 && -n "$disk" ]]; then + echo " Data disk: $GCP_DATADIR_DISK (--delete-data-disk to also remove)" + fi + echo " GCS bucket: gs://$GCP_BUCKET (objects matching $NAME-* deleted above)" + echo " Firewall rule: y-cluster-appliance-public (tag-based, shared)" + echo + + if [[ -z "$vm" && -z "$images" && -z "$objects" ]] \ + && { [[ $delete_data_disk -eq 0 ]] || [[ -z "$disk" ]]; }; then + echo "Nothing to delete." + exit 0 + fi + + confirm "Proceed with teardown?" \ + || { echo "aborted; nothing deleted."; exit 0; } + + if [[ -n "$vm" ]]; then + stage "deleting VM $VM_NAME" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + if [[ -n "$images" ]]; then + stage "deleting custom images ($(echo "$images" | wc -l))" + # shellcheck disable=SC2086 + echo "$images" | xargs -r -I{} \ + gcloud compute images delete {} --project="$GCP_PROJECT" --quiet + fi + if [[ -n "$objects" ]]; then + stage "deleting GCS objects ($(echo "$objects" | wc -l))" + # shellcheck disable=SC2086 + echo "$objects" | xargs -r \ + gcloud storage rm --project="$GCP_PROJECT" + fi + if [[ $delete_data_disk -eq 1 && -n "$disk" ]]; then + stage "deleting persistent data disk $GCP_DATADIR_DISK" + gcloud compute disks delete "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + + stage "teardown complete" +} + +# Minimal pre-checks shared by build and teardown: gcloud +# binary + GCP key + activation. The build flow does +# additional tool checks below the dispatch. +command -v gcloud >/dev/null \ + || { echo "missing required tool: gcloud" >&2; exit 1; } + +if [[ ! -f "$GCP_KEY" ]]; then + echo "missing GCP key: $GCP_KEY" >&2 + echo "create it with: scripts/gcp-bootstrap-credentials.sh on a machine with gcloud Owner access" >&2 + exit 1 +fi +export GOOGLE_APPLICATION_CREDENTIALS="$GCP_KEY" + +# Acknowledge parallel composite uploads up front. 
The setting +# both turns on multi-stream uploads (which is what we want for +# 1.5+ GiB tarballs) AND silences the WARNING stanza gcloud +# would otherwise emit on every `storage cp`. Env-var form so +# we don't mutate the operator's gcloud config. +export CLOUDSDK_STORAGE_PARALLEL_COMPOSITE_UPLOAD_ENABLED=True + +stage "activating GCP service account ($GCP_KEY)" +gcloud auth activate-service-account --key-file="$GCP_KEY" --project="$GCP_PROJECT" >/dev/null + +# Subcommand dispatch. Teardown only needs gcloud + GCP_KEY, +# both verified above; doesn't need go / qemu-img / etc. so +# the build-flow tool check below stays out of its path. +if [[ "${1:-}" = "teardown" ]]; then + shift + do_teardown "$@" + exit 0 +fi + +# Build-flow tool check (additional to gcloud above). +for tool in go qemu-system-x86_64 qemu-img kubectl ssh ssh-keygen curl virt-sysprep; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +# virt-sysprep needs to read /boot/vmlinuz-* (libguestfs supermin). +if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then + cat >&2 < $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +mkdir -p "$CFG_DIR" +cat > "$CFG_DIR/y-cluster-provision.yaml" </dev/null | grep -Fxq "$KUBECTX"; then + echo "kubectl context '$KUBECTX' already exists and is not owned by this script." >&2 + echo " Either remove it: kubectl config delete-context $KUBECTX" >&2 + echo " Or pick a new name: KUBECTX=appliance-qa $0" >&2 + exit 1 +fi + +stage "provisioning $NAME (k3s + Envoy Gateway)" +"$Y_CLUSTER" provision -c "$CFG_DIR" + +# Echo is what creates the Gateway listener (not just the +# Envoy Gateway controller -- the actual Gateway resource that +# binds :80). Without it, any HTTPRoute the operator applies +# in the hands-on window has nothing to attach to and curl +# returns "connection refused" both locally and on the eventual +# GCP VM. Auto-install so the Gateway listener is up by default; +# operators can still delete + replace echo with their own +# workload (the Gateway listener stays, the routing changes). +stage "installing echo workload (Gateway listener + baseline route)" +"$Y_CLUSTER" echo render \ + | kubectl --context="$KUBECTX" apply --server-side --field-manager=appliance-build -f - +kubectl --context="$KUBECTX" -n y-cluster wait \ + --for=condition=Available deployment/echo --timeout=180s + +# === Stage 2: hands-on prompt === +SSH_KEY="$CACHE_DIR/$NAME-ssh" +cat <-ssh{,.pub}` lands in both subdirs; +# the pair is identical (one keypair was generated at +# provision time, both export passes copy from the same +# source under $CACHE_DIR). +mkdir -p "$BUNDLE_DIR" + +stage "exporting Compute Engine image format -> $BUNDLE_DIR/gcp-tar" +"$Y_CLUSTER" export --context="$KUBECTX" --format=gcp-tar "$BUNDLE_DIR/gcp-tar" + +stage "exporting OVA (VirtualBox / VMware Import Appliance) -> $BUNDLE_DIR/ova" +"$Y_CLUSTER" export --context="$KUBECTX" --format=ova "$BUNDLE_DIR/ova" + +ls -lh "$BUNDLE_DIR"/*/ +TARBALL="$BUNDLE_DIR/gcp-tar/$NAME.tar.gz" + +# === Stage 4: confirm before any GCP write === +cat </dev/null || stat -c '%s' "$TARBALL") + +Next: upload to gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz, create a +GCE custom image, ensure firewall opens tcp:80 + tcp:443 on +tagged VMs, create $VM_NAME ($GCP_MACHINE_TYPE in $GCP_ZONE) +from the image. Aborting now leaves the bundle on local disk +unchanged. 
+================================================================ + +EOF + +confirm "Upload $TARBALL to GCS and create VM in $GCP_PROJECT?" \ + || { echo "aborted; bundle preserved at $BUNDLE_DIR."; exit 0; } + +# === Stage 5: GCS bucket (idempotent) === +stage "ensuring GCS bucket gs://$GCP_BUCKET (location $GCP_REGION)" +if ! gcloud storage buckets describe "gs://$GCP_BUCKET" --project="$GCP_PROJECT" >/dev/null 2>&1; then + gcloud storage buckets create "gs://$GCP_BUCKET" \ + --project="$GCP_PROJECT" \ + --location="$GCP_REGION" \ + --uniform-bucket-level-access +else + echo " bucket exists" +fi + +# === Stage 6: upload tarball === +stage "uploading $TARBALL -> gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" +gcloud storage cp "$TARBALL" "gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" --project="$GCP_PROJECT" + +# === Stage 7: create custom image === +stage "creating GCE custom image $IMAGE_NAME (family $GCP_IMAGE_FAMILY)" +gcloud compute images create "$IMAGE_NAME" \ + --project="$GCP_PROJECT" \ + --source-uri="gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" \ + --family="$GCP_IMAGE_FAMILY" \ + --architecture=X86_64 \ + >/dev/null + +# === Stage 8: firewall rule (idempotent) === +FIREWALL_RULE="y-cluster-appliance-public" +stage "ensuring firewall rule $FIREWALL_RULE (tcp:80,443 -> y-cluster-appliance tag)" +if ! gcloud compute firewall-rules describe "$FIREWALL_RULE" --project="$GCP_PROJECT" >/dev/null 2>&1; then + gcloud compute firewall-rules create "$FIREWALL_RULE" \ + --project="$GCP_PROJECT" \ + --direction=INGRESS \ + --network=default \ + --action=ALLOW \ + --rules=tcp:80,tcp:443 \ + --target-tags=y-cluster-appliance \ + --source-ranges=0.0.0.0/0 \ + >/dev/null +else + echo " rule exists" +fi + +# === Stage 8.5: ensure persistent data disk === +# Persistent disk attached to the VM and mounted at /data/yolean +# (the bundled local-path-provisioner's default storage root). +# Survives instance redeploys: tear down the VM, redeploy with a +# fresh image, the same /data/yolean comes back. Disk auto-delete +# is OFF when attaching an existing disk via --disk=name=, so +# `instances delete` won't wipe it. +stage "ensuring persistent data disk $GCP_DATADIR_DISK (size only used on create: $GCP_DATADIR_SIZE)" +if gcloud compute disks describe "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + echo " disk exists -- reusing (data preserved from previous deploy)" +else + gcloud compute disks create "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --size="$GCP_DATADIR_SIZE" \ + --type=pd-balanced \ + >/dev/null + echo " disk created (fresh; will be ext4-formatted on first mount)" +fi + +# === Stage 9: create VM (delete first if exists for idempotency) === +stage "creating $VM_NAME ($GCP_MACHINE_TYPE in $GCP_ZONE) from image $IMAGE_NAME" +if gcloud compute instances describe "$VM_NAME" --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + echo " $VM_NAME exists, deleting first" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null +fi +# device-name=datadir is what GCE writes after the +# `scsi-0Google_PersistentDisk_` prefix in /dev/disk/by-id/ +# inside the VM; the SSH-side mount block uses that stable path +# regardless of /dev/sd* enumeration order. 
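+# (Inside the guest that is
+#  /dev/disk/by-id/scsi-0Google_PersistentDisk_datadir -- the first
+#  candidate the mount block below waits for.)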
+gcloud compute instances create "$VM_NAME" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --machine-type="$GCP_MACHINE_TYPE" \ + --image="$IMAGE_NAME" \ + --image-project="$GCP_PROJECT" \ + --boot-disk-size=20GB \ + --disk="name=$GCP_DATADIR_DISK,device-name=datadir,mode=rw,boot=no" \ + --tags=y-cluster-appliance \ + >/dev/null + +PUBLIC_IP=$(gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --format='get(networkInterfaces[0].accessConfigs[0].natIP)') +echo " public ip: $PUBLIC_IP" + +# === Stage 10: wait for ssh + probe === +# SSH_KEY (from CACHE_DIR) was used by the local cluster but is +# wiped by `y-cluster teardown` at the end of this flow. The +# bundle-dir copy is what the operator can reach the GCP VM +# with afterwards. Switch to the bundle path BEFORE teardown +# runs so subsequent prints reference the path that'll exist. +SSH_KEY="$BUNDLE_DIR/gcp-tar/$NAME-ssh" +SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh on $PUBLIC_IP:22 (cloud-init can take 30-90s on first boot)" +ssh_up=0 +for i in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS -p 22 ystack@"$PUBLIC_IP" 'true' 2>/dev/null; then + echo " ssh up after $i attempt(s)" + ssh_up=1 + break + fi + echo " ssh attempt $i/60: not yet" + sleep 5 +done +if [[ $ssh_up -eq 0 ]]; then + echo "ssh on $PUBLIC_IP never came up; VM left running for diagnosis" >&2 + echo " delete with: gcloud compute instances delete $VM_NAME --project=$GCP_PROJECT --zone=$GCP_ZONE" >&2 + exit 1 +fi + +# === Stage 10.5: mount the persistent disk at /data/yolean === +# The appliance disk doesn't carry GCE guest-tools and our +# prepare_inguest pinned cloud-init to NoCloud only, so we can't +# mount via cloud-init mounts/ or via google-startup-scripts. +# We SSH in and do it directly: +# - format the disk if it has no filesystem (fresh disk) +# - persist the mount via fstab UUID for subsequent reboots +# - mount now +# - restart k3s so it re-discovers /data/yolean (k3s started +# before the mount existed; existing PVs would have mapped +# to empty paths on the root FS until restart) +stage "mounting $GCP_DATADIR_DISK at /data/yolean and restarting k3s" +# shellcheck disable=SC2087 +ssh $SSH_OPTS ystack@"$PUBLIC_IP" 'sudo bash -s' <<'REMOTE' +set -eu +# /dev/disk/by-id/google- requires google-guest-agent, +# which only ships in Google's own GCE images. Our appliance is +# built from the upstream Ubuntu cloud image, so we get the +# kernel-provided SCSI udev path instead: +# /dev/disk/by-id/scsi-0Google_PersistentDisk_ +# `` is what we passed to `gcloud --disk=device-name=datadir`, +# so the path is fully deterministic. We try both shapes -- SCSI +# first (matches the current appliance) and the guest-agent shape +# as a fallback for a future build that does install the agent. +MOUNT=/data/yolean +DEVICE="" +for cand in /dev/disk/by-id/scsi-0Google_PersistentDisk_datadir /dev/disk/by-id/google-datadir; do + for _ in $(seq 1 30); do + if [ -b "$cand" ]; then + DEVICE="$cand" + break 2 + fi + sleep 1 + done +done +[ -n "$DEVICE" ] || { echo "datadir disk never appeared at any expected /dev/disk/by-id/ path" >&2; exit 1; } +echo "datadir: $DEVICE" + +# Format with the label that matches the appliance's pre-baked +# fstab entry (LABEL=y-cluster-data /data/yolean ext4 ...). 
+# Using a different label, or adding a UUID-based fstab line, +# would either skip the pre-bake mount or duplicate it -- we +# want the LABEL line to be the one that fires at boot. +if ! blkid "$DEVICE" >/dev/null 2>&1; then + mkfs.ext4 -F -L y-cluster-data "$DEVICE" +fi +# Idempotent label enforcement: re-running this script against a +# data disk that was formatted by a PREVIOUS version of the script +# (with a different label, e.g. `data-yolean`) would skip mkfs +# above (blkid finds an existing FS) and leave the wrong label in +# place. The appliance's pre-baked /etc/fstab matches by LABEL, so +# a wrong label means the boot-time mount silently no-ops and the +# seed gate fails. e2label is a no-op when the label is already +# correct, so applying it unconditionally is cheap insurance. +e2label "$DEVICE" y-cluster-data + +install -d -m 0755 "$MOUNT" +if ! mountpoint -q "$MOUNT"; then + mount "$MOUNT" +fi + +# At first boot the seed unit ran before this disk was formatted +# and mounted, so it failed the mount-required gate and k3s.service +# stayed down on its Requires=. Now that /data/yolean is a real +# mountpoint, restart the seed unit so it extracts the seed onto +# the customer's volume, then k3s. +systemctl reset-failed y-cluster-data-seed.service k3s.service +systemctl restart y-cluster-data-seed.service +systemctl restart k3s.service +REMOTE + +probe() { + local what=$1 url=$2 attempts=${3:-60} + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o /dev/null -w " $what HTTP %{http_code}\n" "$url"; then + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 10 + done + return 1 +} + +stage "probing http://$PUBLIC_IP -- whatever you applied locally" +# We don't know the operator's routes a priori; try the +# y-cluster-shipped echo path as a baseline. If their workload +# replaced echo, this fails and the operator curls their own +# route. +probe echo "http://$PUBLIC_IP/q/envoy/echo" 30 || \ + echo " (no echo route -- expected if your workload replaced y-cluster echo)" + +if [[ -z "${KEEP_LOCAL:-}" ]]; then + stage "tearing down local cluster (set KEEP_LOCAL=1 to keep it)" + "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null || true # y-script-lint:disable=or-true # cleanup best-effort +fi + +cat < /data/yolean (persistent) + Deliverable: $BUNDLE_DIR + ├── gcp-tar/ (uploaded to GCE, used for the + │ live $VM_NAME above) + └── ova/ (hand to a customer for VirtualBox / + VMware -- same disk state) + +Connect: + ssh -i $SSH_KEY ystack@$PUBLIC_IP + curl http://$PUBLIC_IP/ + +kubectl from your laptop (apiserver not externally exposed): + ssh -L 6443:127.0.0.1:6443 -N -i $SSH_KEY ystack@$PUBLIC_IP & + ssh -i $SSH_KEY ystack@$PUBLIC_IP sudo cat /etc/rancher/k3s/k3s.yaml \\ + > k3s-$VM_NAME.yaml + KUBECONFIG=k3s-$VM_NAME.yaml kubectl get nodes + +Teardown when done: + gcloud compute instances delete $VM_NAME --project=$GCP_PROJECT --zone=$GCP_ZONE + gcloud compute images delete $IMAGE_NAME --project=$GCP_PROJECT + gcloud storage rm gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz --project=$GCP_PROJECT + +Persistent data disk is PRESERVED on teardown so PVC data +survives across redeploys. Re-running this script reuses the +same /data/yolean. 
Delete it manually when you're truly done: + gcloud compute disks delete $GCP_DATADIR_DISK --project=$GCP_PROJECT --zone=$GCP_ZONE +================================================================ +EOF diff --git a/scripts/e2e-appliance-export-import.sh b/scripts/e2e-appliance-export-import.sh new file mode 100755 index 0000000..5d7ca14 --- /dev/null +++ b/scripts/e2e-appliance-export-import.sh @@ -0,0 +1,312 @@ +#!/usr/bin/env bash +# Round-trip an y-cluster appliance through the export/import contract: +# build with y-cluster, install a placeholder application via kubectl, +# prepare-export, stop, copy the qcow2, then boot a SECOND qemu +# instance against the copy with no y-cluster involvement (simulating +# the customer's IT importing on their hypervisor) and verify the +# application reaches a 200 from a fresh process. +# +# Why this exists: +# The "build a per-customer appliance, ship it, customer boots it" +# pathway has never been e2e-tested. The Hetzner Packer flow proved +# snapshot+clone works on Hetzner; it doesn't tell us whether a +# qcow2 produced locally boots cleanly elsewhere. This script is +# the missing test. +# +# Conventions: +# - The application is opaque to y-cluster. We use the echo +# manifest as a placeholder, but install it via `y-cluster echo +# render | kubectl apply -f -` -- the same shape the eventual +# per-customer install will use (kubectl / kustomize / helm +# against the live cluster). y-cluster has no `echo deploy`-like +# special case here. +# - The customer-side qemu invocation is bare bash. No y-cluster +# binary, no seed image, no cloud-init reattach. Just qemu-system +# against the exported qcow2 with new port forwards. If the +# appliance can't survive that, prepare-export has the bug. +# +# Stages: +# 1. Build the dev binary into ./dist (gitignored). +# 2. Provision an appliance (k3s + Envoy Gateway only) under a +# throwaway name. +# 3. Apply the placeholder app via kubectl. +# 4. Smoketest curl on the build-side host. +# 5. y-cluster stop + prepare-export. +# 6. y-cluster export to a bundle dir (flattened qcow2 + +# keypair + README). +# 7. Boot a fresh qemu against the BUNDLED qcow2 with new +# port forwards. The bundle has no backing-file dependency +# on y-cluster's cloud-image cache; this proves the disk is +# genuinely portable. +# 8. Wait for ssh + curl on the imported instance. +# 9. On failure, ssh in and dump k3s state for diagnosis. 
+ +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='e2e-appliance-export-import.sh - local round-trip provision -> kubectl install -> prepare-export -> stop -> raw-qemu boot -> verify + +Usage: e2e-appliance-export-import.sh + +Environment: + NAME Appliance name (default: appliance-export-test) + APP_HTTP_PORT Build-side host port -> guest 80 (default: 39080) + APP_API_PORT Build-side host port -> guest 6443 (default: 39443) + APP_SSH_PORT Build-side host port -> guest 22 (default: 2229) + IMP_HTTP_PORT Import-side host port -> guest 80 (default: 39180) + IMP_SSH_PORT Import-side host port -> guest 22 (default: 2230) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_BUILD Set to keep the build-side cluster after success (default: tear it down) + DEBUG Set non-empty for bash trace + +Dependencies: + go, qemu-system-x86_64, kubectl, ssh, ssh-keygen, curl, virt-sysprep (libguestfs-tools) + +Exit codes: + 0 Round-trip succeeded; imported instance answered the smoketest + 1 Any stage failed; build-side cluster left up for diagnosis +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +NAME="${NAME:-appliance-export-test}" +APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_API_PORT="${APP_API_PORT:-39443}" +APP_SSH_PORT="${APP_SSH_PORT:-2229}" +IMP_HTTP_PORT="${IMP_HTTP_PORT:-39180}" +IMP_SSH_PORT="${IMP_SSH_PORT:-2230}" + +REPO_ROOT="$(git rev-parse --show-toplevel)" +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" +EXPORT_DIR=$(mktemp -d -p /tmp e2e-export.XXXXXX) +CFG_DIR=$(mktemp -d -p /tmp e2e-config.XXXXXX) + +stage() { printf '\n=== %s ===\n' "$*"; } + +cleanup() { + set +e + if [[ -f "$EXPORT_DIR/imported.pid" ]]; then + local imp_pid + imp_pid=$(cat "$EXPORT_DIR/imported.pid" 2>/dev/null) + if [[ -n "$imp_pid" ]] && kill -0 "$imp_pid" 2>/dev/null; then + echo "stopping imported qemu (pid $imp_pid)" + kill -TERM "$imp_pid" 2>/dev/null # y-script-lint:disable=or-true # not relevant here + sleep 2 + kill -KILL "$imp_pid" 2>/dev/null # y-script-lint:disable=or-true # may already be gone + fi + fi +} +trap cleanup EXIT + +for tool in go qemu-system-x86_64 kubectl ssh ssh-keygen curl virt-sysprep; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +# virt-sysprep on Ubuntu fails before it touches the qcow2 if it +# can't read /boot/vmlinuz-* (libguestfs builds a tiny appliance VM +# with the host kernel via supermin). Ubuntu installs kernel images +# 0600 root, so non-root invocations bail with an opaque +# "supermin exited with error status 1". Surface the fix here. +if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then + cat >&2 < $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === 2. Provision the build-side appliance === +# Idempotent re-run: tear down any leftover from a prior failed run. +stage "tearing down any leftover $NAME cluster" +# We need the config in place for teardown to find the cluster, so +# write it BEFORE the teardown attempt. teardown is idempotent +# (no-op when the cluster doesn't exist) so re-entry is safe. +cat > "$CFG_DIR/y-cluster-provision.yaml" <&2 + rm -f "$out" + return 1 +} +probe echo "http://127.0.0.1:$APP_HTTP_PORT/q/envoy/echo" +probe s3 "http://127.0.0.1:$APP_HTTP_PORT/s3/health" + +# === 5. 
stop + prepare-export === +# y-cluster stop owns the graceful guest shutdown (ssh +# poweroff -> wait for qemu exit -> SIGTERM/SIGKILL fallback). +# Without that, qemu's SIGTERM exits in ~200ms and the guest's +# k3s/containerd state isn't flushed, leaving zero-byte +# overlayfs snapshot files on the qcow2 and "exec format error" +# crash loops on the imported boot. +stage "stopping cluster ($NAME)" +"$Y_CLUSTER" stop --context="$NAME" + +stage "prepare-export ($NAME)" +"$Y_CLUSTER" prepare-export --context="$NAME" + +# === 6. y-cluster export -> bundle dir === +# Produces a flattened, self-contained qcow2 (no backing file) +# plus the keypair plus a README. EXPORT_DIR was created by +# mktemp; the export subcommand refuses to write into a +# non-empty dir, so remove that dir and re-create it after the +# export. +BUNDLE_DIR="$EXPORT_DIR/bundle" +stage "exporting bundle to $BUNDLE_DIR (--format=qcow2)" +"$Y_CLUSTER" export --context="$NAME" --format=qcow2 "$BUNDLE_DIR" +ls -la "$BUNDLE_DIR/" +echo " qemu-img info on the bundled disk:" +qemu-img info "$BUNDLE_DIR/$NAME.qcow2" | grep -E '^(file format|virtual size|disk size|backing)' | sed 's/^/ /' + +# === 7. Customer-side: raw qemu against the bundled disk === +# No y-cluster involvement here -- just qemu-system-x86_64 +# pointed at the bundled qcow2 + the bundled key. This proves +# the bundle is genuinely self-contained: any host that can run +# qemu (with the cloud image NOT present at the build path) +# would boot it. +stage "booting bundled qcow2 via raw qemu (host ports $IMP_SSH_PORT -> :22, $IMP_HTTP_PORT -> :80)" +qemu-system-x86_64 \ + -name "$NAME-imported" \ + -machine accel=kvm -cpu host \ + -smp 2 -m 4096 \ + -drive "file=$BUNDLE_DIR/$NAME.qcow2,format=qcow2,if=virtio" \ + -netdev "user,id=n0,hostfwd=tcp::$IMP_SSH_PORT-:22,hostfwd=tcp::$IMP_HTTP_PORT-:80" \ + -device virtio-net-pci,netdev=n0 \ + -serial "file:$EXPORT_DIR/console.log" \ + -display none \ + -daemonize \ + -pidfile "$EXPORT_DIR/imported.pid" +echo " imported pid: $(cat "$EXPORT_DIR/imported.pid")" + +# === 8. Wait for SSH === +SSH_OPTS="-i $BUNDLE_DIR/$NAME-ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh" +ssh_up=0 +for i in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS -p "$IMP_SSH_PORT" ystack@127.0.0.1 'true' 2>/dev/null; then + ssh_up=1 + echo " ssh up after $i tries" + break + fi + sleep 5 +done +if [[ $ssh_up -eq 0 ]]; then + echo "imported instance ssh never came up; console log:" >&2 + tail -50 "$EXPORT_DIR/console.log" >&2 + exit 1 +fi + +# === 9. Imported smoketest === +# Both endpoints must come back: echo (stateless) proves the +# Envoy Gateway data plane is up, /s3/health (StatefulSet against +# the local-path PV that lives on the appliance disk) proves the +# stateful workload survived the export -> bundle -> raw-qemu boot. 
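+# The same checks can be reproduced by hand against the imported
+# instance (URLs shown with the IMP_HTTP_PORT default of 39180):
+#
+#   curl -fsS http://127.0.0.1:39180/q/envoy/echo
+#   curl -fsS http://127.0.0.1:39180/s3/health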
+stage "imported-side smoketest: echo + s3" +imp_probe() { + local what=$1 url=$2 attempts=${3:-60} + local out + out=$(mktemp) + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o "$out" -w " $what HTTP %{http_code}\n" "$url"; then + echo + echo "=== imported $what response (head) ===" + head -25 "$out" + echo + rm -f "$out" + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 5 + done + rm -f "$out" + return 1 +} +if imp_probe echo "http://127.0.0.1:$IMP_HTTP_PORT/q/envoy/echo" \ + && imp_probe s3 "http://127.0.0.1:$IMP_HTTP_PORT/s3/health"; then + echo "=== success: round-trip works (echo + s3) ===" + echo " imported echo reachable at: http://127.0.0.1:$IMP_HTTP_PORT/q/envoy/echo" + echo " imported s3 reachable at: http://127.0.0.1:$IMP_HTTP_PORT/s3/health" + echo " imported ssh: ssh -p $IMP_SSH_PORT -i $BUNDLE_DIR/$NAME-ssh ystack@127.0.0.1" + echo " build-side cluster preserved (KEEP_BUILD=1) -- destroy with: $Y_CLUSTER teardown -c $CFG_DIR" + if [[ -z "${KEEP_BUILD:-}" ]]; then + "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null # y-script-lint:disable=or-true # success path cleanup + fi + exit 0 +fi + +# === Diagnosis on failure === +echo >&2 +echo "imported smoketest never returned 200. Diagnostics:" >&2 +# shellcheck disable=SC2086 +ssh $SSH_OPTS -p "$IMP_SSH_PORT" ystack@127.0.0.1 \ + 'echo ===nodes===; sudo k3s kubectl get nodes -o wide; + echo ===pods===; sudo k3s kubectl get pods -A; + echo ===k3s status===; systemctl is-active k3s; + echo ===listen===; sudo ss -tlnp | grep -E ":(80|443|6443)\b" + ' >&2 # y-script-lint:disable=or-true # diagnostic best-effort +echo " imported ssh: ssh -p $IMP_SSH_PORT -i $BUNDLE_DIR/$NAME-ssh ystack@127.0.0.1" >&2 +echo " console log: $EXPORT_DIR/console.log" >&2 +exit 1 diff --git a/scripts/e2e-appliance-hetzner.pkr.hcl b/scripts/e2e-appliance-hetzner.pkr.hcl new file mode 100644 index 0000000..d65181e --- /dev/null +++ b/scripts/e2e-appliance-hetzner.pkr.hcl @@ -0,0 +1,252 @@ +# Packer template that bakes a y-cluster appliance directly on +# Hetzner Cloud and saves it as a snapshot. Replaces the older +# dd-via-rescue path (qemu-img convert + zstd + dd /dev/sda from +# the rescue image) which broke at the "TCP/22 reachable, no SSH +# banner" stage we couldn't diagnose without a console. +# +# Why Packer + hcloud builder: +# - Hetzner's supported custom-image path is snapshots, not +# uploaded raw images. Building on Hetzner avoids the BIOS / +# partition table / network-driver mismatch you hit when you +# dd a qemu disk onto bare metal. +# - Packer's hcloud builder owns the lifecycle: spin a temporary +# server from a stock Ubuntu image, run provisioners over SSH, +# power off, snapshot, delete the temporary server. +# - The output (snapshot ID + name) feeds straight into +# `hcloud server create --image=` for fleet rollout. +# +# Local appliance vs Hetzner appliance: +# - Local dev still uses `y-cluster provision` against qemu and +# prepare-export when the operator wants a portable qcow2. +# - Production / customer Hetzner deploys go through this Packer +# template instead. +# - Both share the workload manifests (pkg/echo/template.yaml and +# the upstream Envoy Gateway install) by re-running the same +# `y-cluster echo deploy` invocation; only the VM lifecycle +# diverges. +# +# Required: HCLOUD_TOKEN in env, var.y_cluster_binary set to a +# linux/amd64 y-cluster build. The orchestrator script +# (e2e-appliance-hetzner.sh) supplies both. 
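#
# Standalone invocation, for reference (the orchestrator script
# normally supplies all of this; the paths shown are illustrative):
#
#   HCLOUD_TOKEN=... packer init scripts/e2e-appliance-hetzner.pkr.hcl
#   HCLOUD_TOKEN=... packer build \
#     -var y_cluster_binary=./dist/y-cluster \
#     -var prepare_script=./pkg/provision/qemu/prepare_inguest.sh \
#     -var stateful_manifest=/tmp/appliance-stateful.yaml \
#     -var localstorage_manifest=/tmp/y-cluster-localstorage.yaml \
#     scripts/e2e-appliance-hetzner.pkr.hcl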
+ +packer { + required_plugins { + hcloud = { + source = "github.com/hetznercloud/hcloud" + version = ">= 1.6" + } + } +} + +variable "hcloud_token" { + type = string + default = "${env("HCLOUD_TOKEN")}" + sensitive = true +} + +variable "snapshot_name" { + type = string + default = "y-cluster-appliance-{{timestamp}}" +} + +# cx23 = 2 vCPU / 4 GB RAM / 40 GB disk in hel1, ~€0.006/h. +# Hetzner retired cx22 / cpx21 in EU regions during 2026; the +# x86 shared lineup is now cx*3 / cpx*2 and cax* (Ampere arm). +variable "server_type" { + type = string + default = "cx23" +} + +variable "location" { + type = string + default = "hel1" +} + +variable "base_image" { + type = string + default = "ubuntu-24.04" +} + +variable "k3s_version" { + type = string + default = "v1.35.4+k3s1" +} + +# Tracks pkg/provision/envoygateway/version.go's Version constant. +# Kept independent here so `packer build` can be run against an +# older binary if needed; the orchestrator script does NOT pin +# them together to keep that flexibility. +variable "envoy_gateway_version" { + type = string + default = "v1.7.2" +} + +variable "y_cluster_binary" { + type = string + description = "Path to a linux/amd64 y-cluster binary to upload onto the build host" +} + +variable "prepare_script" { + type = string + description = "Path to pkg/provision/qemu/prepare_inguest.sh -- the shared identity-reset script that also runs against offline qcow2 disks via virt-customize" +} + +# Stable k3s node-name baked into the appliance. The build host's +# hostname is whatever Packer assigns (e.g. packer-XXXXXXXX); the +# customer's cloned server will end up with a different hostname +# (Hetzner sets it from the server name on first boot). Pinning +# K3S_NODE_NAME decouples k3s identity from the OS hostname, so +# the cloned server's k3s recognises the node entry baked into +# the snapshot's sqlite datastore. Without this pin, every cloned +# server registers a NEW node under its own hostname while the +# build-host node lingers as orphan, and every workload pod stays +# bound to the dead node. +variable "k3s_node_name" { + type = string + default = "appliance" +} + +variable "stateful_manifest" { + type = string + description = "Path to a pre-rendered single-file YAML for the appliance-stateful workload. Packer's file provisioner doesn't recursively upload directories cleanly across all builders, so the orchestrator script `kubectl kustomize`s testdata/appliance-stateful/base into a temp file and passes the path here." +} + +variable "localstorage_manifest" { + type = string + description = "Path to a pre-rendered local-path-provisioner manifest (output of `y-cluster localstorage render`). Same shape as stateful_manifest -- a host-rendered single yaml, applied via kubectl on the build VM." +} + +source "hcloud" "appliance" { + token = var.hcloud_token + image = var.base_image + location = var.location + server_type = var.server_type + ssh_username = "root" + snapshot_name = var.snapshot_name + snapshot_labels = { + purpose = "y-cluster-appliance" + } +} + +build { + sources = ["source.hcloud.appliance"] + + # Stage the y-cluster binary on the build host. Used here for + # `y-cluster echo deploy`; left on the appliance as a no-cost + # operator-inspection convenience. + provisioner "file" { + source = var.y_cluster_binary + destination = "/usr/local/bin/y-cluster" + } + + # Stage the shared identity-reset script. Same script runs on + # the qemu prepare-export path via virt-customize. 
Single + # source of truth for what the appliance disk looks like at + # snapshot time. + provisioner "file" { + source = var.prepare_script + destination = "/usr/local/bin/y-cluster-prepare" + } + + # Stage the stateful-workload manifest (VersityGW + # StatefulSet + Service + HTTPRoute + 1Gi local-path PVC). + # The file is a single rendered YAML produced by the + # orchestrator's `kubectl kustomize`, so this is a plain + # one-file scp -- no recursive directory upload, no Packer + # SSH-communicator quirks. + provisioner "file" { + source = var.stateful_manifest + destination = "/root/appliance-stateful.yaml" + } + + # Stage the bundled local-path-provisioner manifest + # (rendered by `y-cluster localstorage render` on the host). + # Replaces k3s's disabled local-storage addon with the + # appliance-shape defaults: path /data/yolean, predictable + # PVC namespace_name pattern, Retain reclaim. + provisioner "file" { + source = var.localstorage_manifest + destination = "/root/y-cluster-localstorage.yaml" + } + + # k3s install + workload + smoketest, all running normally. + # We run k3s during the build (no INSTALL_K3S_SKIP_START) so + # the snapshot includes a fully-converged cluster: kubeconfig, + # sqlite-resident workload state, pulled container images, + # everything. The cloned server's k3s recognises the node + # entry by K3S_NODE_NAME (baked in via /etc/systemd/system/ + # k3s.service.env) and resumes -- no orphan node, no first-boot + # manifests-dir reconcile loop, faster startup. + provisioner "shell" { + inline_shebang = "/bin/bash -eux" + environment_vars = [ + "K3S_VERSION=${var.k3s_version}", + "K3S_NODE_NAME=${var.k3s_node_name}", + "ENVOY_GATEWAY_VERSION=${var.envoy_gateway_version}", + "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", + ] + inline = [ + "cloud-init status --wait", + "chmod +x /usr/local/bin/y-cluster /usr/local/bin/y-cluster-prepare", + # Install + start. K3S_NODE_NAME comes from the + # environment_vars block above; the install script writes + # it into /etc/systemd/system/k3s.service.env so the + # cloned server's systemd-managed k3s reads it back on + # cold boot. + # --disable=local-storage: y-cluster ships its own + # local-path-provisioner via the y-cluster-localstorage.yaml + # applied below; k3s's bundled local-storage would otherwise + # reconcile our ConfigMap back to the upstream defaults. + "curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=\"$K3S_VERSION\" INSTALL_K3S_EXEC='--disable=traefik --disable=local-storage' sh -", + "until kubectl get nodes 2>/dev/null | grep -qE ' Ready '; do sleep 2; done", + # Bundled local-path-provisioner with appliance-shape + # defaults (path /data/yolean, predictable PVC + # namespace_name pattern, Retain reclaim). + "kubectl apply --server-side --field-manager=y-cluster -f /root/y-cluster-localstorage.yaml", + "kubectl --namespace=local-path-storage rollout status deployment/local-path-provisioner --timeout=120s", + # Envoy Gateway upstream install + the y-cluster GatewayClass. 
+ "kubectl apply --server-side -f https://github.com/envoyproxy/gateway/releases/download/$ENVOY_GATEWAY_VERSION/install.yaml", + "kubectl wait --namespace=envoy-gateway-system --for=condition=Available deployments --all --timeout=180s", + "kubectl apply --server-side -f - <<'EOF'\napiVersion: gateway.networking.k8s.io/v1\nkind: GatewayClass\nmetadata:\n name: y-cluster\nspec:\n controllerName: gateway.envoyproxy.io/gatewayclass-controller\nEOF", + # Echo workload via the standard kubectl path -- y-cluster + # has no special case for the customer's app. + "/usr/local/bin/y-cluster echo deploy --context default", + "kubectl --namespace=y-cluster wait --for=condition=Available deployment/echo --timeout=120s", + # Stateful workload: VersityGW (S3-over-posix gateway) + # backed by a local-path PVC. Brings up the persistent- + # volume code path so the snapshot includes a + # provisioned PV directory under /var/lib/rancher/k3s/ + # storage, with the StatefulSet bound to it. Cloned + # servers' k3s recognises the same node-name (appliance) + # and rebinds the same PV directory -- no orphan, no + # re-provision. + "kubectl apply --server-side --field-manager=appliance-build -f /root/appliance-stateful.yaml", + "kubectl --namespace=appliance-stateful rollout status statefulset/versitygw --timeout=180s", + # In-VM smoketest: klipper-lb (k3s's bundled LoadBalancer + # controller) binds host port 80 on the node. Probe both + # the echo path and the s3 path so a build with a broken + # PVC, missing storage class, or mis-routed HTTPRoute + # fails at build time. + "for i in $(seq 1 60); do curl -fsS http://localhost/q/envoy/echo && break; sleep 2; done", + "for i in $(seq 1 60); do curl -fsS http://localhost/s3/health && break; sleep 2; done", + ] + } + + # Identity reset via the shared script. Runs in the live VM + # against /etc/cloud/cloud.cfg.d/, /etc/netplan/, log files, + # bash history, etc. Same script the qemu prepare-export + # runs offline; one source of truth. + # + # After the script, stop k3s gracefully so the snapshot + # captures a quiesced sqlite datastore. Packer's hcloud + # builder powers the VM off and snapshots after this + # provisioner returns. + provisioner "shell" { + inline_shebang = "/bin/bash -eux" + inline = [ + "/usr/local/bin/y-cluster-prepare", + "systemctl stop k3s", + "sync", + ] + } +} diff --git a/scripts/e2e-appliance-hetzner.sh b/scripts/e2e-appliance-hetzner.sh new file mode 100755 index 0000000..9fbc137 --- /dev/null +++ b/scripts/e2e-appliance-hetzner.sh @@ -0,0 +1,306 @@ +#!/usr/bin/env bash +# e2e: build a y-cluster appliance snapshot on Hetzner Cloud via +# Packer, boot a server from it, and verify the echo HTTPRoute +# answers over the public IP. +# +# Replaces the older dd-via-rescue path (qemu-img convert + zstd + +# dd /dev/sda from rescue mode) which broke at the "TCP/22 reachable, +# no SSH banner" stage we couldn't diagnose without out-of-band +# console. Packer's hcloud builder handles base-image / partition +# layout / network drivers natively, so the path "image boots on +# Hetzner" is no longer something we have to engineer ourselves -- +# we get it for free by building on Hetzner from the start. +# +# Local appliance vs Hetzner appliance: +# - Local dev still uses `y-cluster provision` against qemu and +# prepare-export when the operator wants a portable qcow2. +# - Production Hetzner deploys go through this script, which +# produces a reusable snapshot a fleet can clone from. +# +# Stages: +# 1. 
Build a current-arch y-cluster dev binary into ./dist (the +# Packer template uploads it onto the build host). +# 2. `packer init` + `packer build` of e2e-appliance-hetzner.pkr.hcl. +# Packer creates a temporary cx23 in hel1, runs the workload +# install, snapshots, and tears the temporary server down. +# 3. Resolve the snapshot ID from `hcloud image list`. +# 4. Create a fresh server from the snapshot (idempotent: deletes +# any matching $SERVER_NAME first). +# 5. Probe http:///q/envoy/echo until it answers. +# +# Prerequisites: +# - HCLOUD_TOKEN sourced from $ENV_FILE (set in .env or shell env) +# - hcloud CLI on PATH (apt install hcloud OR snap install hcloud) +# - packer on PATH (apt install packer after adding HashiCorp's +# repo, OR download from releases.hashicorp.com) +# - go (to build the dev binary), curl, ssh-keygen +# +# On success: prints the public IP and leaves the server running so +# the operator can poke at it. Teardown is manual: +# hcloud server delete $SERVER_NAME +# hcloud image delete # optional: snapshot is reusable +# The script is idempotent on re-run -- it deletes any matching +# server/key first and starts fresh from a new snapshot. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='e2e-appliance-hetzner.sh - Build a y-cluster appliance snapshot on Hetzner Cloud and verify it serves traffic + +Usage: e2e-appliance-hetzner.sh + +Environment: + HCLOUD_TOKEN Hetzner Cloud API token (sourced from ENV_FILE) + ENV_FILE Path to env file with HCLOUD_TOKEN (set in .env or shell env; required) + SERVER_NAME Server name to create (default: y-cluster-appliance-test) + SERVER_TYPE Hetzner server type (default: cx23) + SERVER_LOCATION Hetzner location (default: hel1) + SNAPSHOT_NAME Snapshot description used as Packer output name + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + DEBUG Set non-empty to enable bash trace + +Dependencies: + packer, hcloud, go, ssh, ssh-keygen, curl + +Exit codes: + 0 Success: appliance reachable on public IP + 1 Missing prereq, packer build failure, or echo never answered +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${ENV_FILE:?set ENV_FILE in .env or shell env}" + +if [[ ! -f "$ENV_FILE" ]]; then + echo "missing env file: $ENV_FILE" >&2 + echo "expected at minimum: HCLOUD_TOKEN=" >&2 + exit 1 +fi +# shellcheck disable=SC1090 +source "$ENV_FILE" +[[ -n "${HCLOUD_TOKEN:-}" ]] || { echo "HCLOUD_TOKEN not set in $ENV_FILE" >&2; exit 1; } +export HCLOUD_TOKEN + +# Tunables. Defaults match the Packer template's; override here when +# experimenting with alternate locations / instance types. 
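# Example override (placeholder values -- any server type / location
# pair the Hetzner project offers works; unset vars keep the defaults
# below):
#
#   SERVER_TYPE=<type> SERVER_LOCATION=<location> ./scripts/e2e-appliance-hetzner.sh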
+SERVER_NAME="${SERVER_NAME:-y-cluster-appliance-test}" +SERVER_TYPE="${SERVER_TYPE:-cx23}" +SERVER_LOCATION="${SERVER_LOCATION:-hel1}" +SNAPSHOT_NAME="${SNAPSHOT_NAME:-y-cluster-appliance-$(date -u +%Y%m%d-%H%M%S)}" + +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +PACKER_TEMPLATE="$REPO_ROOT/scripts/e2e-appliance-hetzner.pkr.hcl" + +for tool in packer hcloud go ssh ssh-keygen curl; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +stage() { printf '\n=== %s ===\n' "$*"; } + +confirm() { + local prompt=$1 + if [[ -n "${ASSUME_YES:-}" ]]; then + echo "ASSUME_YES set; proceeding ($prompt)" + return 0 + fi + read -r -p "$prompt [y/N] " answer + case "${answer,,}" in + y|yes) return 0 ;; + *) return 1 ;; + esac +} + +cat <<'WARN' + +================================================================ +DEPRECATION WARNING + +scripts/e2e-appliance-hetzner.sh's role is changing. + +Today this script tests the legacy "Hetzner-as-export-mode" +shape: build an appliance inside a Hetzner VM via Packer, +snapshot, boot a server from the snapshot. This shape doesn't +match the appliance contract (Hetzner Cloud has no public API +for uploading a locally-built disk). + +Replacement plan: + - This script will be REPURPOSED once a Hetzner PROVISIONER + exists in pkg/provision/hetzner/ (alongside qemu / docker / + multipass). Repurposed scope: end-to-end test of + `y-cluster provision -c hetzner.yaml` -> snapshot -> + instantiate-from-snapshot. The Packer-build half goes away; + the snapshot becomes a regular y-cluster lifecycle artefact. + - The local-build appliance contract is moving to + scripts/appliance-qemu-to-gcp.sh (Hetzner's API can't + accept a local disk; GCP's `gcloud compute images import` + can). + +This script still runs. It still passes. But its purpose is +about to flip; treat results from a green run today as +"Packer build still works" rather than "appliance contract +verified". +================================================================ + +WARN +confirm "Proceed with the legacy Packer e2e anyway?" \ + || { echo "aborted; no changes made."; exit 0; } + +# === 1. Build the dev binary the Packer template uploads === +stage "building linux/amd64 dev binary -> $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && GOOS=linux GOARCH=amd64 go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === 2. render stateful manifest + packer init + build === +# Packer's file provisioner doesn't recursively upload +# directories cleanly across all builder/communicator +# combinations (hcloud's SSH communicator scp's a directory +# argument as a single path and gets back "Is a directory"). +# Pre-render the kustomize base on the host into one yaml file +# and ship that single file to the build VM instead. Same end +# result, no scp recursion concerns. +# The fixture is split into two yconverge modules (namespace +# first, then the StatefulSet+Service+HTTPRoute) so the local +# convergence path can express the dep with a cue import. The +# Hetzner Packer flow doesn't run yconverge inside the build +# VM (would need the y-cluster binary on the VM) -- it stays +# kubectl-apply, but we render BOTH bases and concat. kubectl +# applies a Namespace ahead of namespaced resources in the +# same -f input, so a single concat'd file converges in the +# right order. 
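# The concat'd file therefore has roughly this shape (abridged
# sketch; the real resources live under testdata/appliance-stateful/):
#
#   apiVersion: v1
#   kind: Namespace
#   ...
#   ---
#   kind: StatefulSet    # versitygw, plus its Service / HTTPRoute / PVC
#   ...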
+STATEFUL_MANIFEST=$(mktemp -t appliance-stateful.XXXXXX.yaml) +{ + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/namespace" + echo '---' + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/base" +} > "$STATEFUL_MANIFEST" + +# y-cluster's bundled local-path-provisioner manifest (replaces +# k3s's disabled local-storage). Rendered with the same defaults +# the Go-side provisioners install so an appliance built via +# Hetzner Packer ends up indistinguishable from one built locally. +LOCALSTORAGE_MANIFEST=$(mktemp -t y-cluster-localstorage.XXXXXX.yaml) +"$Y_CLUSTER" localstorage render > "$LOCALSTORAGE_MANIFEST" + +stage "packer init" +packer init "$PACKER_TEMPLATE" + +stage "packer build (creates a temporary $SERVER_TYPE in $SERVER_LOCATION, snapshots, deletes)" +packer build \ + -var "snapshot_name=$SNAPSHOT_NAME" \ + -var "server_type=$SERVER_TYPE" \ + -var "location=$SERVER_LOCATION" \ + -var "y_cluster_binary=$Y_CLUSTER" \ + -var "prepare_script=$REPO_ROOT/pkg/provision/qemu/prepare_inguest.sh" \ + -var "stateful_manifest=$STATEFUL_MANIFEST" \ + -var "localstorage_manifest=$LOCALSTORAGE_MANIFEST" \ + "$PACKER_TEMPLATE" + +# === 3. Resolve snapshot ID === +# Packer's hcloud builder prints the snapshot ID at the end of build +# but doesn't expose it in a stable machine-readable way without a +# manifest post-processor. hcloud image list is the simpler path. +stage "resolving snapshot id for $SNAPSHOT_NAME" +SNAPSHOT_ID=$(hcloud image list \ + --type=snapshot \ + --selector="purpose=y-cluster-appliance" \ + --output=json \ + | python3 -c " +import json, sys +images = json.load(sys.stdin) +matches = [i for i in images if i.get('description') == '$SNAPSHOT_NAME'] +if not matches: + sys.exit('no snapshot named $SNAPSHOT_NAME found') +print(matches[0]['id']) +") +echo " snapshot id: $SNAPSHOT_ID" + +# === 4. Create a fresh ssh keypair + server from the snapshot === +KEY_DIR=$(mktemp -d) +trap 'rm -rf "$KEY_DIR" "$STATEFUL_MANIFEST" "$LOCALSTORAGE_MANIFEST"' EXIT +ssh-keygen -t ed25519 -N '' -C "$SERVER_NAME-$$" -f "$KEY_DIR/id" -q +KEY_NAME="$SERVER_NAME" + +stage "tearing down any leftover server / key from a prior run" +hcloud server delete "$SERVER_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing server is not an error +hcloud ssh-key delete "$KEY_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing key is not an error + +stage "registering ssh public key as $KEY_NAME" +hcloud ssh-key create --name "$KEY_NAME" --public-key-from-file "$KEY_DIR/id.pub" >/dev/null + +stage "creating $SERVER_NAME from snapshot $SNAPSHOT_ID" +hcloud server create \ + --name "$SERVER_NAME" \ + --type "$SERVER_TYPE" \ + --image "$SNAPSHOT_ID" \ + --location "$SERVER_LOCATION" \ + --ssh-key "$KEY_NAME" \ + >/dev/null +PUBLIC_IP=$(hcloud server ip "$SERVER_NAME") +echo " public ip: $PUBLIC_IP" + +# === 5. Wait for sshd, then probe the echo HTTPRoute === +SSH_OPTS="-i $KEY_DIR/id -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh on $PUBLIC_IP:22" +for _ in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS root@"$PUBLIC_IP" 'true' 2>/dev/null; then + break + fi + sleep 5 +done + +# Cold boot from snapshot: cloud-init runs (~30s), k3s.service starts +# for the first time, the envoy gateway controller comes up, the +# envoy proxy data plane comes up, the VersityGW StatefulSet +# rebinds its PV, klipper-lb binds :80. 
The probe loop is long +# enough to cover the whole chain on a fresh cx23. +probe() { + local what=$1 url=$2 attempts=${3:-60} + local out + out=$(mktemp) + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o "$out" -w " $what HTTP %{http_code}\n" "$url"; then + echo + echo "=== $what response (head) ===" + head -25 "$out" + echo + rm -f "$out" + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 10 + done + rm -f "$out" + return 1 +} + +stage "probing http://$PUBLIC_IP -- echo + s3" +if probe echo "http://$PUBLIC_IP/q/envoy/echo" \ + && probe s3 "http://$PUBLIC_IP/s3/health"; then + echo "=== success: cloned server serves echo + s3 ===" + echo " echo: http://$PUBLIC_IP/q/envoy/echo" + echo " s3: http://$PUBLIC_IP/s3/health" + echo " ssh: ssh -i $KEY_DIR/id root@$PUBLIC_IP" + echo " destroy: hcloud server delete $SERVER_NAME" + echo " snapshot ($SNAPSHOT_ID) preserved -- reuse with: hcloud server create --image=$SNAPSHOT_ID ..." + exit 0 +fi + +echo >&2 +echo "echo never answered within $((ATTEMPTS * 10))s. server still up for diagnosis:" >&2 +# shellcheck disable=SC2086 +ssh $SSH_OPTS root@"$PUBLIC_IP" 'systemctl is-active k3s; kubectl get pods -A 2>&1 | head -30' >&2 \ + || true # y-script-lint:disable=or-true # diagnostic best-effort -- main failure already exits 1 +echo " ssh: ssh -i $KEY_DIR/id root@$PUBLIC_IP" >&2 +echo " destroy: hcloud server delete $SERVER_NAME" >&2 +exit 1 diff --git a/scripts/e2e-appliance-qemu-to-gcp.sh b/scripts/e2e-appliance-qemu-to-gcp.sh new file mode 100755 index 0000000..6f722de --- /dev/null +++ b/scripts/e2e-appliance-qemu-to-gcp.sh @@ -0,0 +1,382 @@ +#!/usr/bin/env bash +# e2e: complete qemu-to-GCP appliance workflow, non-interactive. +# +# This is the canonical SRE example for the appliance contract: +# the disk we verify locally with qemu IS the disk that boots in +# Google Compute Engine. No re-build on the cloud side; the GCS +# tarball is exactly what `y-cluster export --format=gcp-tar` +# produced from the local provision. +# +# The workflow this script documents -- in order -- is what an +# SRE follows by hand when they want to ship a customer +# appliance to GCP: +# +# 1. Bootstrap a GCP service account in the QA project (one +# time per project; output is a JSON key the rest of the +# flow consumes via GOOGLE_APPLICATION_CREDENTIALS). +# scripts/gcp-bootstrap-credentials.sh +# +# 2. Provision a y-cluster appliance locally on qemu. This +# gives the same k3s + Envoy Gateway + bundled local-path +# stack the customer will run. +# y-cluster provision -c +# +# 3. Install the customer's workload(s). The e2e here uses +# the y-cluster echo workload + the appliance-stateful +# VersityGW StatefulSet as stand-ins; in real customer +# flows this is whatever kubectl apply / yconverge / helm +# the customer specifies. The Hetzner Object Storage +# tutorial uses VersityGW; the principle is the same. +# y-cluster echo render | kubectl apply -f - +# y-cluster yconverge -k testdata/appliance-stateful/base +# +# 4. Smoketest from the host. Anything that's reachable on +# :80 of the local qemu's port-forward is reachable on +# :80 of the eventual GCE VM. +# curl http://127.0.0.1:39080/q/envoy/echo +# +# 5. Stop the cluster cleanly so the qcow2 is quiesced. The +# graceful-stop logic flushes containerd snapshot state. +# y-cluster stop --context=$NAME +# +# 6. 
prepare-export: virt-customize-driven identity reset +# (machine-id retained, ssh host keys retained, cloud-init +# cleaned, netplan generic-NIC match installed, +# systemd-timesyncd enabled). This is the step that makes +# the disk portable. +# y-cluster prepare-export --context=$NAME +# +# 7. Export to GCE custom-image format. Produces +# /.tar.gz containing exactly disk.raw. +# y-cluster export --context=$NAME --format=gcp-tar +# +# 8. Upload to GCS. Bucket created on first run with +# uniform-access mode. +# gcloud storage cp /.tar.gz \ +# gs://-appliance-images/.tar.gz +# +# 9. Create custom image from the GCS object. Direct create +# (no managed conversion job). +# gcloud compute images create \ +# --source-uri=gs://-appliance-images/.tar.gz +# +# 10. Ensure firewall opens public ports. Idempotent. +# gcloud compute firewall-rules create y-cluster-appliance-public ... +# +# 11. Create VM from the image, tagged for the firewall rule. +# gcloud compute instances create \ +# --image= --tags=y-cluster-appliance ... +# +# 12. Wait for ssh + probe HTTP. The disk we just built is the +# disk now booting; if smoketest passes here, it's the same +# smoketest that passed locally. +# +# 13. Teardown: delete the VM, the image, the GCS object, the +# local cluster. The e2e is the thing that proves the +# contract; we don't leave artefacts behind. +# +# Re-run safety: every step is idempotent. Running this twice +# in a row produces the same result; partial-failure re-runs +# pick up where the previous left off (fresh teardown of any +# leftover server / image / cluster on entry). +# +# This script is the proof. The interactive variant is +# scripts/appliance-qemu-to-gcp.sh -- same flow but with +# operator prompts at the export and GCP-write boundaries. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='e2e-appliance-qemu-to-gcp.sh - canonical SRE workflow: provision -> install -> verify -> prepare-export -> export gcp-tar -> upload -> image -> instance -> probe -> teardown + +Usage: e2e-appliance-qemu-to-gcp.sh + +Environment: + GCP_PROJECT GCP project (set in .env or shell env; required) + GCP_REGION GCP region (default: europe-north2) + GCP_ZONE GCP zone (default: europe-north2-a) + GCP_BUCKET GCS bucket (default: -appliance-images) + GCP_MACHINE_TYPE Machine type (default: e2-medium) + GCP_KEY Service account JSON (set in .env or shell env; required) + NAME Cluster + VM name (default: appliance-gcp-e2e) + KEEP Set to skip teardown for diagnosis (default: tear down on success) + DEBUG Set non-empty for bash trace + +Dependencies: + go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl, + virt-sysprep, gcloud +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . 
"$REPO_ROOT/.env"; set +o allexport +fi + +: "${GCP_PROJECT:?set GCP_PROJECT in .env or shell env}" +: "${GCP_KEY:?set GCP_KEY in .env or shell env}" + +GCP_REGION="${GCP_REGION:-europe-north2}" +GCP_ZONE="${GCP_ZONE:-europe-north2-a}" +GCP_BUCKET="${GCP_BUCKET:-${GCP_PROJECT}-appliance-images}" +GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-medium}" + +NAME="${NAME:-appliance-gcp-e2e}" +APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_API_PORT="${APP_API_PORT:-39443}" +APP_SSH_PORT="${APP_SSH_PORT:-2229}" + +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" +CFG_DIR="$HOME/.cache/y-cluster-appliance-build/$NAME" +BUNDLE_DIR=$(mktemp -d -p "$REPO_ROOT/dist" "appliance-gcp-e2e.XXXXXX" 2>/dev/null \ + || mktemp -d -p /tmp "appliance-gcp-e2e.XXXXXX") + +IMAGE_NAME="$NAME-$(date -u +%Y%m%d-%H%M%S)" +VM_NAME="$NAME" + +stage() { printf '\n=== %s ===\n' "$*"; } + +teardown() { + set +e + if [[ -n "${KEEP:-}" ]]; then + echo + echo "KEEP set; preserving artefacts for diagnosis:" + echo " local cluster: $Y_CLUSTER teardown -c $CFG_DIR" + echo " GCE VM: gcloud compute instances delete $VM_NAME --project=$GCP_PROJECT --zone=$GCP_ZONE" + echo " GCE image: gcloud compute images delete $IMAGE_NAME --project=$GCP_PROJECT" + echo " GCS object: gcloud storage rm gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz --project=$GCP_PROJECT" + echo " bundle: $BUNDLE_DIR" + return + fi + stage "teardown" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing VM is not an error + gcloud compute images delete "$IMAGE_NAME" \ + --project="$GCP_PROJECT" --quiet 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing image is not an error + gcloud storage rm "gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" \ + --project="$GCP_PROJECT" 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing object is not an error + "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing cluster is not an error + rm -rf "$BUNDLE_DIR" +} +trap teardown EXIT + +for tool in go qemu-system-x86_64 qemu-img kubectl ssh ssh-keygen curl virt-sysprep gcloud; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +if [[ ! -f "$GCP_KEY" ]]; then + echo "missing GCP key: $GCP_KEY" >&2 + echo "create it with: scripts/gcp-bootstrap-credentials.sh" >&2 + exit 1 +fi +export GOOGLE_APPLICATION_CREDENTIALS="$GCP_KEY" + +# Acknowledge parallel composite uploads up front (silences +# the WARNING stanza gcloud would otherwise emit on every +# `storage cp` for files >150 MiB). +export CLOUDSDK_STORAGE_PARALLEL_COMPOSITE_UPLOAD_ENABLED=True + +if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then + cat >&2 </dev/null + +# === 1. Build dev binary === +stage "building dev binary -> $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === 2. Provision local qemu === +mkdir -p "$CFG_DIR" +cat > "$CFG_DIR/y-cluster-provision.yaml" < $BUNDLE_DIR" +# y-cluster export refuses non-empty bundle dirs; the mktemp -d +# above created an empty dir we own, so a fresh re-run is fine. +# On retry-after-failure paths the dir might have content from +# the previous attempt, so we wipe + let export recreate. 
+rm -rf "$BUNDLE_DIR" +"$Y_CLUSTER" export --context="$NAME" --format=gcp-tar "$BUNDLE_DIR" +TARBALL="$BUNDLE_DIR/$NAME.tar.gz" +echo " size: $(stat -c '%s' "$TARBALL" | numfmt --to=iec-i --suffix=B 2>/dev/null || stat -c '%s' "$TARBALL")" + +# === 8. Upload to GCS === +stage "ensuring bucket gs://$GCP_BUCKET ($GCP_REGION)" +if ! gcloud storage buckets describe "gs://$GCP_BUCKET" --project="$GCP_PROJECT" >/dev/null 2>&1; then + gcloud storage buckets create "gs://$GCP_BUCKET" \ + --project="$GCP_PROJECT" \ + --location="$GCP_REGION" \ + --uniform-bucket-level-access +fi + +stage "uploading -> gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" +gcloud storage cp "$TARBALL" "gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" --project="$GCP_PROJECT" + +# === 9. Create custom image === +stage "creating GCE custom image $IMAGE_NAME" +gcloud compute images create "$IMAGE_NAME" \ + --project="$GCP_PROJECT" \ + --source-uri="gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" \ + --family=y-cluster-appliance \ + --architecture=X86_64 \ + >/dev/null + +# === 10. Firewall (idempotent) === +FIREWALL_RULE="y-cluster-appliance-public" +stage "ensuring firewall rule $FIREWALL_RULE" +if ! gcloud compute firewall-rules describe "$FIREWALL_RULE" --project="$GCP_PROJECT" >/dev/null 2>&1; then + gcloud compute firewall-rules create "$FIREWALL_RULE" \ + --project="$GCP_PROJECT" \ + --direction=INGRESS \ + --network=default \ + --action=ALLOW \ + --rules=tcp:80,tcp:443 \ + --target-tags=y-cluster-appliance \ + --source-ranges=0.0.0.0/0 \ + >/dev/null +fi + +# === 11. Create VM === +stage "creating $VM_NAME ($GCP_MACHINE_TYPE in $GCP_ZONE)" +if gcloud compute instances describe "$VM_NAME" --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null +fi +gcloud compute instances create "$VM_NAME" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --machine-type="$GCP_MACHINE_TYPE" \ + --image="$IMAGE_NAME" \ + --image-project="$GCP_PROJECT" \ + --boot-disk-size=20GB \ + --tags=y-cluster-appliance \ + >/dev/null +PUBLIC_IP=$(gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --format='get(networkInterfaces[0].accessConfigs[0].natIP)') +echo " public ip: $PUBLIC_IP" + +# === 12. Wait for ssh + probe HTTP === +SSH_KEY="$CACHE_DIR/$NAME-ssh" +SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" + +echo " waiting for ssh on $PUBLIC_IP:22 (cloud-init can take 30-90s on first boot)" +ssh_up=0 +for i in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS ystack@"$PUBLIC_IP" 'true' 2>/dev/null; then + echo " ssh up after $i attempt(s)" + ssh_up=1 + break + fi + echo " ssh attempt $i/60: not yet" + sleep 5 +done +[[ $ssh_up -eq 1 ]] || { echo "ssh never came up on $PUBLIC_IP" >&2; exit 1; } + +probe_remote() { + local what=$1 url=$2 attempts=${3:-60} + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o /dev/null -w " $what HTTP %{http_code}\n" "$url"; then + return 0 + fi + echo " $what attempt $i/$attempts" + sleep 10 + done + return 1 +} + +stage "probing http://$PUBLIC_IP -- echo + s3 (same routes the local cluster served)" +if probe_remote echo "http://$PUBLIC_IP/q/envoy/echo" \ + && probe_remote s3 "http://$PUBLIC_IP/s3/health"; then + echo + echo "================================================================" + echo "PASS: appliance-qemu-to-gcp e2e." 
+ echo + echo "Local-built disk booted in GCP and served the same routes that" + echo "the local qemu served. The appliance contract holds." + echo + echo " Public IP: $PUBLIC_IP" + echo " SSH: ssh -i $SSH_KEY ystack@$PUBLIC_IP" + echo " echo: http://$PUBLIC_IP/q/envoy/echo" + echo " s3 health: http://$PUBLIC_IP/s3/health" + echo "================================================================" + exit 0 +fi + +echo >&2 +echo "remote probes never returned; instance left for diagnosis (KEEP=1 to skip cleanup):" >&2 +# shellcheck disable=SC2086 +ssh $SSH_OPTS ystack@"$PUBLIC_IP" \ + 'sudo systemctl is-active k3s; sudo k3s kubectl get pods -A 2>&1 | head -30' >&2 \ + || true # y-script-lint:disable=or-true # diagnostic best-effort +exit 1 diff --git a/scripts/gcp-bootstrap-credentials.sh b/scripts/gcp-bootstrap-credentials.sh new file mode 100755 index 0000000..fdd29c7 --- /dev/null +++ b/scripts/gcp-bootstrap-credentials.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# Bootstrap a service-account JSON key for automation to use +# against a GCP project (typically your y-cluster appliance QA +# project; see .env.example for the operator-local default). +# +# Run this on a machine where you're already gcloud-logged-in +# as a project Owner (or Editor with IAM admin). It will: +# 1. Verify your active gcloud account can act on the project. +# 2. Enable the Compute / Storage APIs the appliance-qemu-to-gcp +# flow needs. (No Cloud Build: we convert qcow2 -> raw -> tar +# locally and use `images create --source-uri=gs://...`, which +# is a direct image create with no managed conversion job.) +# 3. Create (or reuse) a service account named +# @.iam.gserviceaccount.com. +# 4. Grant it roles/owner on the project. (QA project; broad +# role keeps the bootstrap simple. Tighten later if QA gets +# reused for non-QA assets.) +# 5. Generate a JSON key for the service account. +# 6. Print the JSON between unmistakable BEGIN/END markers so +# you can copy-paste from your terminal scrollback to the +# machine that needs the credentials. The key is also left +# on disk at $KEY_FILE in case you'd rather scp it. +# +# After copying: on the other machine, save the JSON between +# the markers (NOT the markers themselves) to a file, chmod +# 600 it, and point GCP_KEY in $REPO_ROOT/.env at it. The +# appliance scripts read GCP_KEY from .env. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='gcp-bootstrap-credentials.sh - create + grant + key a service account for the y-cluster appliance flow, then print the JSON for cross-machine copy-paste + +Usage: gcp-bootstrap-credentials.sh + +Environment: + GCP_PROJECT GCP project (set in .env or shell env; required) + SA_NAME Service account local part (default: y-cluster-appliance) + KEY_FILE Where to write the JSON key on this machine + (default: ./y-cluster-gcp-key.json) + DEBUG Set non-empty for bash trace + +Dependencies: + gcloud (logged in as a Project Owner or equivalent) +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . 
"$REPO_ROOT/.env"; set +o allexport +fi + +: "${GCP_PROJECT:?set GCP_PROJECT in .env or shell env}" +PROJECT_ID="$GCP_PROJECT" +SA_NAME="${SA_NAME:-y-cluster-appliance}" +SA_EMAIL="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" +KEY_FILE="${KEY_FILE:-./y-cluster-gcp-key.json}" + +command -v gcloud >/dev/null || { echo "gcloud not found on PATH" >&2; exit 1; } + +stage() { printf '\n=== %s ===\n' "$*"; } + +# 1. Verify caller is logged in and can see the project. +stage "verifying gcloud auth + project access ($PROJECT_ID)" +ACTIVE=$(gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null || true) # y-script-lint:disable=or-true # gcloud returns nonzero when no active account; we surface our own error below +if [[ -z "$ACTIVE" ]]; then + echo "no active gcloud account; run: gcloud auth login" >&2 + exit 1 +fi +echo " active account: $ACTIVE" +gcloud projects describe "$PROJECT_ID" --format="value(projectId)" >/dev/null \ + || { echo "cannot read project $PROJECT_ID with $ACTIVE" >&2; exit 1; } + +# 2. Enable required APIs. Idempotent: gcloud reports the +# already-enabled ones as no-ops. +stage "enabling APIs (compute, storage)" +gcloud services enable \ + compute.googleapis.com \ + storage.googleapis.com \ + --project="$PROJECT_ID" + +# 3. Create the service account (idempotent: skip if it +# exists). gcloud doesn't ship a clean "create or skip", so +# we probe first. +stage "creating service account $SA_EMAIL (idempotent)" +if gcloud iam service-accounts describe "$SA_EMAIL" \ + --project="$PROJECT_ID" >/dev/null 2>&1; then + echo " already exists, reusing" +else + gcloud iam service-accounts create "$SA_NAME" \ + --display-name="y-cluster appliance automation" \ + --description="Used by scripts/appliance-qemu-to-gcp.sh to upload custom images and provision VMs in $PROJECT_ID" \ + --project="$PROJECT_ID" +fi + +# 4. Grant roles/owner on the project. QA project; broad role +# is intentional and matches the project's stated purpose. If +# this account ever gets reused for non-QA assets, tighten to +# the union of: compute.admin, storage.admin, +# iam.serviceAccountUser. +stage "granting roles/owner on $PROJECT_ID to $SA_EMAIL" +gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$SA_EMAIL" \ + --role="roles/owner" \ + --project="$PROJECT_ID" \ + --condition=None \ + >/dev/null + +# 5. Mint a fresh JSON key. Each invocation creates a new key. +# GCP allows up to 10 keys per service account; if the operator +# is rotating, they can `gcloud iam service-accounts keys list` +# and delete the stale ones with `keys delete`. +stage "minting JSON key -> $KEY_FILE" +rm -f "$KEY_FILE" +gcloud iam service-accounts keys create "$KEY_FILE" \ + --iam-account="$SA_EMAIL" \ + --project="$PROJECT_ID" +chmod 600 "$KEY_FILE" + +# 6. Print the JSON between markers for clipboard-friendly +# copy. Markers are exact strings the destination machine can +# grep for if they want to extract programmatically. +echo +echo "================================================================" +echo "JSON key for $SA_EMAIL" +echo "Project: $PROJECT_ID" +echo +echo "On the destination machine, save the lines BETWEEN the" +echo "----- BEGIN ... ----- and ----- END ... 
----- markers" +echo "(NOT the markers themselves) to a file, then:" +echo " chmod 600 " +echo " set GCP_KEY= in \$REPO_ROOT/.env" +echo "================================================================" +echo +echo "----- BEGIN GCP SERVICE ACCOUNT KEY ($SA_EMAIL) -----" +cat "$KEY_FILE" +echo +echo "----- END GCP SERVICE ACCOUNT KEY ($SA_EMAIL) -----" +echo +echo "Local copy of the key (kept for scp / re-paste): $KEY_FILE" +echo "To revoke this key later:" +echo " gcloud iam service-accounts keys list --iam-account=$SA_EMAIL --project=$PROJECT_ID" +echo " gcloud iam service-accounts keys delete --iam-account=$SA_EMAIL --project=$PROJECT_ID" From c4f1cecf718bf80a35326ff91f6fd8d05f429464 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Mon, 4 May 2026 16:42:13 +0000 Subject: [PATCH 02/16] test: align local gateway access on port 80 (e2e + scripts) Yolean dev / setup scripts that smoke-test the gateway expect a host-side port that reaches guest 80. Today's qemu-side host port forwards default to 39080 in both the Go e2e helper and the bash appliance-build scripts, so any consumer that hardcodes http://localhost:80 has to remember the offset. This host (and most modern Linux distros) ships net.ipv4.ip_unprivileged_port_start=80, so qemu's user-mode hostfwd inherits the ability to bind port 80 without root. Default APP_HTTP_PORT and the e2e port-forward helper to 80 in lockstep: - e2e/qemu_test.go: e2eUniqueForwards now takes both apiPort and httpPort; every test passes its own pair (28443 / 28444 / ... vs 26443 / 26444 / ...) keyed off the apiPort so concurrent test runs on the same host don't collide. Each test always gets a guest-80 forward, matching what the appliance-build scripts install. - scripts/appliance-{qemu-to-gcp,build-hetzner,build-virtualbox}.sh + scripts/e2e-appliance-{export-import,qemu-to-gcp}.sh: the APP_HTTP_PORT default flips from 39080 to 80, with YHELP / inline curl examples updated to match. Override via env (APP_HTTP_PORT=39080 ./scripts/...) on hosts that keep port 80 privileged. 
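For hosts where the new default can't bind (port 80 still
privileged), the check and the sysctl change look roughly like this
(illustrative -- the drop-in file name is arbitrary):

    sysctl net.ipv4.ip_unprivileged_port_start
    echo 'net.ipv4.ip_unprivileged_port_start = 80' \
      | sudo tee /etc/sysctl.d/99-unprivileged-port-start.conf
    sudo sysctl --system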
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-build-hetzner.sh | 4 ++-- scripts/appliance-build-virtualbox.sh | 4 ++-- scripts/appliance-qemu-to-gcp.sh | 4 ++-- scripts/e2e-appliance-export-import.sh | 4 ++-- scripts/e2e-appliance-qemu-to-gcp.sh | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/appliance-build-hetzner.sh b/scripts/appliance-build-hetzner.sh index ee91740..628d88a 100755 --- a/scripts/appliance-build-hetzner.sh +++ b/scripts/appliance-build-hetzner.sh @@ -30,7 +30,7 @@ Environment: ENV_FILE Hetzner credentials file (set in .env or shell env; required) HCLOUD_TOKEN Hetzner Cloud API token (sourced from ENV_FILE) NAME Local cluster name (default: appliance-hetzner-build) - APP_HTTP_PORT Local host port -> guest 80 (default: 39080) + APP_HTTP_PORT Local host port -> guest 80 (default: 80) APP_API_PORT Local host port -> guest 6443 (default: 39443) APP_SSH_PORT Local host port -> guest 22 (default: 2229) SERVER_NAME Hetzner server name (default: y-cluster-appliance) @@ -60,7 +60,7 @@ fi : "${ENV_FILE:?set ENV_FILE in .env or shell env}" NAME="${NAME:-appliance-hetzner-build}" -APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_HTTP_PORT="${APP_HTTP_PORT:-80}" APP_API_PORT="${APP_API_PORT:-39443}" APP_SSH_PORT="${APP_SSH_PORT:-2229}" SERVER_NAME="${SERVER_NAME:-y-cluster-appliance}" diff --git a/scripts/appliance-build-virtualbox.sh b/scripts/appliance-build-virtualbox.sh index 923aaa6..f8db0bb 100755 --- a/scripts/appliance-build-virtualbox.sh +++ b/scripts/appliance-build-virtualbox.sh @@ -32,7 +32,7 @@ Positional: Environment: NAME Appliance name (default: appliance-virtualbox) - APP_HTTP_PORT Host port -> guest 80 (default: 39080) + APP_HTTP_PORT Host port -> guest 80 (default: 80) APP_API_PORT Host port -> guest 6443 (default: 39443) APP_SSH_PORT Host port -> guest 22 (default: 2229) Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) @@ -53,7 +53,7 @@ case "${1:-}" in esac NAME="${NAME:-appliance-virtualbox}" -APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_HTTP_PORT="${APP_HTTP_PORT:-80}" APP_API_PORT="${APP_API_PORT:-39443}" APP_SSH_PORT="${APP_SSH_PORT:-2229}" diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 83da3f2..96b0f55 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -74,7 +74,7 @@ Environment: something else, or delete the existing one. 
IMAGE_NAME Custom image name in GCE (default: -) VM_NAME Compute Engine VM name (default: $NAME) - APP_HTTP_PORT Local host port -> guest 80 (default: 39080) + APP_HTTP_PORT Local host port -> guest 80 (default: 80) APP_API_PORT Local host port -> guest 6443 (default: 39443) APP_SSH_PORT Local host port -> guest 22 (default: 2229) Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) @@ -114,7 +114,7 @@ NAME="${NAME:-appliance-gcp-build}" KUBECTX="${KUBECTX:-local}" IMAGE_NAME="${IMAGE_NAME:-${NAME}-$(date -u +%Y%m%d-%H%M%S)}" VM_NAME="${VM_NAME:-$NAME}" -APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_HTTP_PORT="${APP_HTTP_PORT:-80}" APP_API_PORT="${APP_API_PORT:-39443}" APP_SSH_PORT="${APP_SSH_PORT:-2229}" diff --git a/scripts/e2e-appliance-export-import.sh b/scripts/e2e-appliance-export-import.sh index 5d7ca14..e5bae04 100755 --- a/scripts/e2e-appliance-export-import.sh +++ b/scripts/e2e-appliance-export-import.sh @@ -50,7 +50,7 @@ Usage: e2e-appliance-export-import.sh Environment: NAME Appliance name (default: appliance-export-test) - APP_HTTP_PORT Build-side host port -> guest 80 (default: 39080) + APP_HTTP_PORT Build-side host port -> guest 80 (default: 80) APP_API_PORT Build-side host port -> guest 6443 (default: 39443) APP_SSH_PORT Build-side host port -> guest 22 (default: 2229) IMP_HTTP_PORT Import-side host port -> guest 80 (default: 39180) @@ -75,7 +75,7 @@ case "${1:-}" in esac NAME="${NAME:-appliance-export-test}" -APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_HTTP_PORT="${APP_HTTP_PORT:-80}" APP_API_PORT="${APP_API_PORT:-39443}" APP_SSH_PORT="${APP_SSH_PORT:-2229}" IMP_HTTP_PORT="${IMP_HTTP_PORT:-39180}" diff --git a/scripts/e2e-appliance-qemu-to-gcp.sh b/scripts/e2e-appliance-qemu-to-gcp.sh index 6f722de..fd1225c 100755 --- a/scripts/e2e-appliance-qemu-to-gcp.sh +++ b/scripts/e2e-appliance-qemu-to-gcp.sh @@ -33,7 +33,7 @@ # 4. Smoketest from the host. Anything that's reachable on # :80 of the local qemu's port-forward is reachable on # :80 of the eventual GCE VM. -# curl http://127.0.0.1:39080/q/envoy/echo +# curl http://127.0.0.1:80/q/envoy/echo # # 5. Stop the cluster cleanly so the qcow2 is quiesced. The # graceful-stop logic flushes containerd snapshot state. @@ -127,7 +127,7 @@ GCP_BUCKET="${GCP_BUCKET:-${GCP_PROJECT}-appliance-images}" GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-medium}" NAME="${NAME:-appliance-gcp-e2e}" -APP_HTTP_PORT="${APP_HTTP_PORT:-39080}" +APP_HTTP_PORT="${APP_HTTP_PORT:-80}" APP_API_PORT="${APP_API_PORT:-39443}" APP_SSH_PORT="${APP_SSH_PORT:-2229}" From 72085210f1b5cb11ce899ccdb3cebe4043a9ffb0 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Mon, 4 May 2026 16:46:24 +0000 Subject: [PATCH 03/16] test: bump appliance VM/disk size to 40G (avoid disk-pressure flakes) Appliance e2e / build flows install workloads, build a seed tarball, prepare-export, and re-boot from the prepared disk -- the cumulative footprint pushes the 20G default disk into pressure on the kubelet's image-gc thresholds, which surfaces as flaky pod evictions mid-test or mid-build. Bump to 40G everywhere a 20G default sat: - e2e/qemu_test.go: e2eQEMURuntime overrides DiskSize to 40G so every qemu e2e test boots with the larger disk by default. - scripts/appliance-{qemu-to-gcp,build-hetzner,build-virtualbox}.sh + scripts/e2e-appliance-{export-import,qemu-to-gcp}.sh: the generated y-cluster-provision.yaml now sets diskSize: "40G". 
- scripts/appliance-qemu-to-gcp.sh: --boot-disk-size on `gcloud compute instances create` flips from 20GB to 40GB so the GCE VM doesn't reject the 40G custom image with "Requested disk size cannot be smaller than the image size". qcow2 is sparse, so the host-disk footprint only grows with actual usage; the larger virtual size is a no-cost ceiling. The GCE side similarly uses a thin-provisioned persistent disk. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-build-hetzner.sh | 1 + scripts/appliance-build-virtualbox.sh | 1 + scripts/appliance-qemu-to-gcp.sh | 3 ++- scripts/e2e-appliance-export-import.sh | 1 + scripts/e2e-appliance-qemu-to-gcp.sh | 1 + 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/appliance-build-hetzner.sh b/scripts/appliance-build-hetzner.sh index 628d88a..de5cb0b 100755 --- a/scripts/appliance-build-hetzner.sh +++ b/scripts/appliance-build-hetzner.sh @@ -162,6 +162,7 @@ context: $NAME sshPort: "$APP_SSH_PORT" memory: "4096" cpus: "2" +diskSize: "40G" portForwards: - host: "$APP_API_PORT" guest: "6443" diff --git a/scripts/appliance-build-virtualbox.sh b/scripts/appliance-build-virtualbox.sh index f8db0bb..6c67343 100755 --- a/scripts/appliance-build-virtualbox.sh +++ b/scripts/appliance-build-virtualbox.sh @@ -106,6 +106,7 @@ context: $NAME sshPort: "$APP_SSH_PORT" memory: "4096" cpus: "2" +diskSize: "40G" portForwards: - host: "$APP_API_PORT" guest: "6443" diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 96b0f55..f929d17 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -297,6 +297,7 @@ context: $KUBECTX sshPort: "$APP_SSH_PORT" memory: "4096" cpus: "2" +diskSize: "40G" portForwards: - host: "$APP_API_PORT" guest: "6443" @@ -507,7 +508,7 @@ gcloud compute instances create "$VM_NAME" \ --machine-type="$GCP_MACHINE_TYPE" \ --image="$IMAGE_NAME" \ --image-project="$GCP_PROJECT" \ - --boot-disk-size=20GB \ + --boot-disk-size=40GB \ --disk="name=$GCP_DATADIR_DISK,device-name=datadir,mode=rw,boot=no" \ --tags=y-cluster-appliance \ >/dev/null diff --git a/scripts/e2e-appliance-export-import.sh b/scripts/e2e-appliance-export-import.sh index e5bae04..86c295c 100755 --- a/scripts/e2e-appliance-export-import.sh +++ b/scripts/e2e-appliance-export-import.sh @@ -142,6 +142,7 @@ context: $NAME sshPort: "$APP_SSH_PORT" memory: "4096" cpus: "2" +diskSize: "40G" portForwards: - host: "$APP_API_PORT" guest: "6443" diff --git a/scripts/e2e-appliance-qemu-to-gcp.sh b/scripts/e2e-appliance-qemu-to-gcp.sh index fd1225c..0548ce3 100755 --- a/scripts/e2e-appliance-qemu-to-gcp.sh +++ b/scripts/e2e-appliance-qemu-to-gcp.sh @@ -209,6 +209,7 @@ context: $NAME sshPort: "$APP_SSH_PORT" memory: "4096" cpus: "2" +diskSize: "40G" portForwards: - host: "$APP_API_PORT" guest: "6443" From b3f4791d3c694526a415d41ee0fdde808530a58d Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 5 May 2026 09:39:04 +0000 Subject: [PATCH 04/16] chore(scripts): drop port defaults from bash, defer to y-cluster's The appliance-build / e2e scripts each carried a defaults block: APP_HTTP_PORT="${APP_HTTP_PORT:-80}" APP_API_PORT="${APP_API_PORT:-39443}" APP_SSH_PORT="${APP_SSH_PORT:-2229}" then interpolated those into the heredoc'd y-cluster-provision.yaml. 
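In miniature the change looks like this (sketch, using APP_SSH_PORT as the example; the real scripts emit name/context/memory/cpus/diskSize around it):

    # before: bash owned a fallback and the YAML always carried the field
    APP_SSH_PORT="${APP_SSH_PORT:-2229}"
    echo "sshPort: \"$APP_SSH_PORT\""

    # after: emit the field only when the operator set the var; an
    # omitted field means y-cluster applies its own default (2222)
    [ -n "${APP_SSH_PORT:-}" ] && echo "sshPort: \"$APP_SSH_PORT\""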
Three of the four values restate y-cluster's own defaults (80/6443/2222 in pkg/provision/config); the bash defaults that DIFFERED (39443 vs 6443; 2229 vs 2222) were chosen for collision avoidance against an operator's regular y-cluster, but were quiet duplicates of the same defaulting concept. Replace the heredoc with a brace block that emits each port field ONLY when the env var is set. Net behaviour: - No env override -> minimal YAML; y-cluster fills 2222 + {6443:6443, 80:80, 443:443}. - APP_HTTP_PORT=N -> only the host:N -> guest:80 entry lands; API/SSH still y-cluster-default. - Multiple set -> all set entries land; requireHostAPIPort validates that a guest:6443 entry exists. Display refs (banner curl examples, ssh commands, smoketest probes) use ${APP_*_PORT:-NN} inline so the printed URL/SSH command shows the right port whether overridden or default. YHELP entries reworded from "(default: 80)" to "(y-cluster default: 80)" so the operator sees who owns the default. IMP_HTTP_PORT / IMP_SSH_PORT in e2e-appliance-export-import.sh left as-is (test-only; the import-side qemu is started directly, no y-cluster CLI involvement, so y-cluster's defaults don't apply). Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-build-hetzner.sh | 52 ++++++++++++------------- scripts/appliance-build-virtualbox.sh | 54 +++++++++++++------------- scripts/appliance-qemu-to-gcp.sh | 50 ++++++++++++------------ scripts/e2e-appliance-export-import.sh | 48 ++++++++++++----------- scripts/e2e-appliance-qemu-to-gcp.sh | 38 +++++++++--------- 5 files changed, 124 insertions(+), 118 deletions(-) diff --git a/scripts/appliance-build-hetzner.sh b/scripts/appliance-build-hetzner.sh index de5cb0b..e2095ea 100755 --- a/scripts/appliance-build-hetzner.sh +++ b/scripts/appliance-build-hetzner.sh @@ -30,9 +30,9 @@ Environment: ENV_FILE Hetzner credentials file (set in .env or shell env; required) HCLOUD_TOKEN Hetzner Cloud API token (sourced from ENV_FILE) NAME Local cluster name (default: appliance-hetzner-build) - APP_HTTP_PORT Local host port -> guest 80 (default: 80) - APP_API_PORT Local host port -> guest 6443 (default: 39443) - APP_SSH_PORT Local host port -> guest 22 (default: 2229) + APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) SERVER_NAME Hetzner server name (default: y-cluster-appliance) SERVER_TYPE Hetzner server type (default: cx23) SERVER_LOCATION Hetzner location (default: hel1) @@ -60,9 +60,6 @@ fi : "${ENV_FILE:?set ENV_FILE in .env or shell env}" NAME="${NAME:-appliance-hetzner-build}" -APP_HTTP_PORT="${APP_HTTP_PORT:-80}" -APP_API_PORT="${APP_API_PORT:-39443}" -APP_SSH_PORT="${APP_SSH_PORT:-2229}" SERVER_NAME="${SERVER_NAME:-y-cluster-appliance}" SERVER_TYPE="${SERVER_TYPE:-cx23}" SERVER_LOCATION="${SERVER_LOCATION:-hel1}" @@ -155,20 +152,23 @@ mkdir -p "$(dirname "$Y_CLUSTER")" # === Local config === mkdir -p "$CFG_DIR" -cat > "$CFG_DIR/y-cluster-provision.yaml" < "$CFG_DIR/y-cluster-provision.yaml" # === Stage 1: local provision + install + smoketest === stage "tearing down any leftover $NAME cluster" @@ -200,24 +200,24 @@ probe_local() { echo "$what smoketest never succeeded; aborting" >&2 return 1 } -probe_local echo "http://127.0.0.1:$APP_HTTP_PORT/q/envoy/echo" -probe_local s3 "http://127.0.0.1:$APP_HTTP_PORT/s3/health" +probe_local echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" +probe_local s3 
"http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" cat < guest 80 (default: 80) - APP_API_PORT Host port -> guest 6443 (default: 39443) - APP_SSH_PORT Host port -> guest 22 (default: 2229) + APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) KEEP_CLUSTER Set to keep the cluster alive after export (default: tear it down) @@ -53,9 +53,6 @@ case "${1:-}" in esac NAME="${NAME:-appliance-virtualbox}" -APP_HTTP_PORT="${APP_HTTP_PORT:-80}" -APP_API_PORT="${APP_API_PORT:-39443}" -APP_SSH_PORT="${APP_SSH_PORT:-2229}" REPO_ROOT="$(git rev-parse --show-toplevel)" Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" @@ -99,20 +96,23 @@ mkdir -p "$(dirname "$Y_CLUSTER")" # === Config (always written; teardown + prepare-export need it) === mkdir -p "$CFG_DIR" -cat > "$CFG_DIR/y-cluster-provision.yaml" < "$CFG_DIR/y-cluster-provision.yaml" if [[ -z "${SKIP_PROVISION:-}" ]]; then stage "tearing down any leftover $NAME cluster" @@ -145,8 +145,8 @@ if [[ -z "${SKIP_PROVISION:-}" ]]; then echo "$what smoketest never succeeded; aborting" >&2 return 1 } - probe echo "http://127.0.0.1:$APP_HTTP_PORT/q/envoy/echo" - probe s3 "http://127.0.0.1:$APP_HTTP_PORT/s3/health" + probe echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" + probe s3 "http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" else stage "SKIP_PROVISION set; resuming against existing $NAME cluster" fi @@ -159,19 +159,19 @@ cat <-) VM_NAME Compute Engine VM name (default: $NAME) - APP_HTTP_PORT Local host port -> guest 80 (default: 80) - APP_API_PORT Local host port -> guest 6443 (default: 39443) - APP_SSH_PORT Local host port -> guest 22 (default: 2229) + APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) KEEP_LOCAL Set to keep the local cluster after upload (default: tear down) @@ -114,9 +114,6 @@ NAME="${NAME:-appliance-gcp-build}" KUBECTX="${KUBECTX:-local}" IMAGE_NAME="${IMAGE_NAME:-${NAME}-$(date -u +%Y%m%d-%H%M%S)}" VM_NAME="${VM_NAME:-$NAME}" -APP_HTTP_PORT="${APP_HTTP_PORT:-80}" -APP_API_PORT="${APP_API_PORT:-39443}" -APP_SSH_PORT="${APP_SSH_PORT:-2229}" Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" @@ -290,20 +287,25 @@ mkdir -p "$(dirname "$Y_CLUSTER")" ( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) mkdir -p "$CFG_DIR" -cat > "$CFG_DIR/y-cluster-provision.yaml" < "$CFG_DIR/y-cluster-provision.yaml" stage "tearing down any leftover $NAME cluster" "$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error @@ -345,9 +347,9 @@ cat < guest 80 (default: 80) - APP_API_PORT Build-side host port -> guest 6443 (default: 39443) - APP_SSH_PORT Build-side host port -> guest 22 (default: 2229) + APP_HTTP_PORT Override build-side host port for guest 80 (y-cluster default: 80) + APP_API_PORT Override build-side host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override build-side host port for guest 22 
(y-cluster default: 2222) IMP_HTTP_PORT Import-side host port -> guest 80 (default: 39180) IMP_SSH_PORT Import-side host port -> guest 22 (default: 2230) Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) @@ -75,9 +75,10 @@ case "${1:-}" in esac NAME="${NAME:-appliance-export-test}" -APP_HTTP_PORT="${APP_HTTP_PORT:-80}" -APP_API_PORT="${APP_API_PORT:-39443}" -APP_SSH_PORT="${APP_SSH_PORT:-2229}" +# Import-side host ports: kept hardcoded (not env-overridable + +# defaulted) because the import-side qemu is started directly by +# this script (no y-cluster CLI involvement) and these values +# can't collide with the build-side y-cluster's defaults. IMP_HTTP_PORT="${IMP_HTTP_PORT:-39180}" IMP_SSH_PORT="${IMP_SSH_PORT:-2230}" @@ -135,20 +136,23 @@ stage "tearing down any leftover $NAME cluster" # We need the config in place for teardown to find the cluster, so # write it BEFORE the teardown attempt. teardown is idempotent # (no-op when the cluster doesn't exist) so re-entry is safe. -cat > "$CFG_DIR/y-cluster-provision.yaml" < "$CFG_DIR/y-cluster-provision.yaml" "$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error rm -f "$CACHE_DIR/$NAME".* "$CACHE_DIR/$NAME-"* @@ -192,8 +196,8 @@ probe() { rm -f "$out" return 1 } -probe echo "http://127.0.0.1:$APP_HTTP_PORT/q/envoy/echo" -probe s3 "http://127.0.0.1:$APP_HTTP_PORT/s3/health" +probe echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" +probe s3 "http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" # === 5. stop + prepare-export === # y-cluster stop owns the graceful guest shutdown (ssh diff --git a/scripts/e2e-appliance-qemu-to-gcp.sh b/scripts/e2e-appliance-qemu-to-gcp.sh index 0548ce3..d901643 100755 --- a/scripts/e2e-appliance-qemu-to-gcp.sh +++ b/scripts/e2e-appliance-qemu-to-gcp.sh @@ -127,9 +127,6 @@ GCP_BUCKET="${GCP_BUCKET:-${GCP_PROJECT}-appliance-images}" GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-medium}" NAME="${NAME:-appliance-gcp-e2e}" -APP_HTTP_PORT="${APP_HTTP_PORT:-80}" -APP_API_PORT="${APP_API_PORT:-39443}" -APP_SSH_PORT="${APP_SSH_PORT:-2229}" Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" @@ -202,20 +199,23 @@ mkdir -p "$(dirname "$Y_CLUSTER")" # === 2. Provision local qemu === mkdir -p "$CFG_DIR" -cat > "$CFG_DIR/y-cluster-provision.yaml" < "$CFG_DIR/y-cluster-provision.yaml" stage "tearing down any leftover $NAME cluster" "$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error @@ -247,8 +247,8 @@ probe_local() { done return 1 } -probe_local echo "http://127.0.0.1:$APP_HTTP_PORT/q/envoy/echo" -probe_local s3 "http://127.0.0.1:$APP_HTTP_PORT/s3/health" +probe_local echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" +probe_local s3 "http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" # === 5. Stop === stage "stopping cluster" From fe59a25060f482058e0be12ead7a602b0fe617f2 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 5 May 2026 09:47:59 +0000 Subject: [PATCH 05/16] chore(scripts): also forward host:guest 443 conditionally Symmetric with APP_HTTP_PORT / APP_API_PORT: a new APP_HTTPS_PORT env var lets operators override the host port forwarded to guest 443. Unset means "let y-cluster apply its default" -- the YAML still omits the field when no port var is set, which matches the behaviour for the other ports. 
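For illustration (hypothetical override values): with APP_API_PORT=39443, APP_HTTP_PORT=8080 and APP_HTTPS_PORT=8443 exported, the emitted block now comes out roughly as

    portForwards:
    - host: "39443"
      guest: "6443"
    - host: "8080"
      guest: "80"
    - host: "8443"
      guest: "443"

while leaving all three unset still omits portForwards entirely, so y-cluster's own defaults apply.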
Without this, an operator who overrides any one of {HTTP, API} silently lost 443 forwarding (the YAML's portForwards block became canonical and didn't include 443; previously y-cluster's [6443:6443, 80:80, 443:443] default applied only when the bash emitted no portForwards at all). The host:guest match keeps standard ports inside the appliance unchanged; the host-side ip_unprivileged_port_start sysctl on modern Linux distros allows binding 443 without root the same way 80 already does. YHELP entries updated to surface the new knob. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-build-hetzner.sh | 8 +++++--- scripts/appliance-build-virtualbox.sh | 8 +++++--- scripts/appliance-qemu-to-gcp.sh | 8 +++++--- scripts/e2e-appliance-export-import.sh | 8 +++++--- scripts/e2e-appliance-qemu-to-gcp.sh | 7 ++++--- 5 files changed, 24 insertions(+), 15 deletions(-) diff --git a/scripts/appliance-build-hetzner.sh b/scripts/appliance-build-hetzner.sh index e2095ea..53893b9 100755 --- a/scripts/appliance-build-hetzner.sh +++ b/scripts/appliance-build-hetzner.sh @@ -31,6 +31,7 @@ Environment: HCLOUD_TOKEN Hetzner Cloud API token (sourced from ENV_FILE) NAME Local cluster name (default: appliance-hetzner-build) APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override host port for guest 443 (y-cluster default: 443) APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) SERVER_NAME Hetzner server name (default: y-cluster-appliance) @@ -163,10 +164,11 @@ mkdir -p "$CFG_DIR" echo 'memory: "4096"' echo 'cpus: "2"' echo 'diskSize: "40G"' - if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then echo "portForwards:" - [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" - [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" fi } > "$CFG_DIR/y-cluster-provision.yaml" diff --git a/scripts/appliance-build-virtualbox.sh b/scripts/appliance-build-virtualbox.sh index b6b828d..a7ed227 100755 --- a/scripts/appliance-build-virtualbox.sh +++ b/scripts/appliance-build-virtualbox.sh @@ -33,6 +33,7 @@ Positional: Environment: NAME Appliance name (default: appliance-virtualbox) APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override host port for guest 443 (y-cluster default: 443) APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) @@ -107,10 +108,11 @@ mkdir -p "$CFG_DIR" echo 'memory: "4096"' echo 'cpus: "2"' echo 'diskSize: "40G"' - if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then echo "portForwards:" - [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" - [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n 
guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" fi } > "$CFG_DIR/y-cluster-provision.yaml" diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 999460a..28dcf31 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -75,6 +75,7 @@ Environment: IMAGE_NAME Custom image name in GCE (default: -) VM_NAME Compute Engine VM name (default: $NAME) APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override host port for guest 443 (y-cluster default: 443) APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) @@ -300,10 +301,11 @@ mkdir -p "$CFG_DIR" echo 'memory: "4096"' echo 'cpus: "2"' echo 'diskSize: "40G"' - if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then echo "portForwards:" - [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" - [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" fi } > "$CFG_DIR/y-cluster-provision.yaml" diff --git a/scripts/e2e-appliance-export-import.sh b/scripts/e2e-appliance-export-import.sh index 7b90f1c..e4b5099 100755 --- a/scripts/e2e-appliance-export-import.sh +++ b/scripts/e2e-appliance-export-import.sh @@ -51,6 +51,7 @@ Usage: e2e-appliance-export-import.sh Environment: NAME Appliance name (default: appliance-export-test) APP_HTTP_PORT Override build-side host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override build-side host port for guest 443 (y-cluster default: 443) APP_API_PORT Override build-side host port for guest 6443 (y-cluster default: 6443) APP_SSH_PORT Override build-side host port for guest 22 (y-cluster default: 2222) IMP_HTTP_PORT Import-side host port -> guest 80 (default: 39180) @@ -147,10 +148,11 @@ stage "tearing down any leftover $NAME cluster" echo 'memory: "4096"' echo 'cpus: "2"' echo 'diskSize: "40G"' - if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then echo "portForwards:" - [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" - [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" fi } > "$CFG_DIR/y-cluster-provision.yaml" diff --git a/scripts/e2e-appliance-qemu-to-gcp.sh b/scripts/e2e-appliance-qemu-to-gcp.sh index d901643..8f16523 100755 --- a/scripts/e2e-appliance-qemu-to-gcp.sh +++ b/scripts/e2e-appliance-qemu-to-gcp.sh @@ -210,10 +210,11 @@ mkdir -p "$CFG_DIR" echo 'memory: "4096"' echo 'cpus: "2"' echo 'diskSize: 
"40G"' - if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then echo "portForwards:" - [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" - [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" fi } > "$CFG_DIR/y-cluster-provision.yaml" From d30020b9c72ec0b8dd1043ff9d02ae4b33dafc89 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 5 May 2026 10:47:39 +0000 Subject: [PATCH 06/16] feat(scripts/qemu-to-gcp): optional regional HTTPS LB at end-of-build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a post-deploy step that offers to stand up a GCP regional External Application Load Balancer in front of the appliance VM with a self-signed cert covering operator-supplied FQDNs. Idempotent (describe-then-create) so re-runs converge; teardown integrated into the existing teardown subcommand. Why a self-signed cert and a prompt-not-default The cert-manager → upload-real-cert path is the eventual production shape, but for the dev loop a self-signed cert lets the operator verify the LB stack + HTTPRoute hostname matching without DNS / CA dependencies. The opt-in default is a billing meter (forwarding rule ~hourly, reserved IP) the operator should deliberately accept; we don't want a forgotten ASSUME_YES run to silently provision one. Operator UX - Default: prompts after the HTTP probe with a one-paragraph explainer (cost, self-signed cert, HTTPRoute prerequisite), accepts comma-separated FQDNs, empty skips. - TLS_DOMAINS env var preset: skips the prompt and runs. - ASSUME_YES alone: skips silently (unattended e2e shouldn't surprise-bill). - Final banner prints the LB IP + a single /etc/hosts line covering all FQDNs, marks the cert SELF-SIGNED, points at the gcloud commands to swap in a real cert later. Resources, all named ${NAME}-tls-* proxy-only subnet (reuses any ACTIVE one in the region; creates per-build only when none exists) static regional IP SSL cert (uploaded, self-signed) HTTP health check on /q/envoy/echo zonal NEG with the VM as endpoint backend service (EXTERNAL_MANAGED) + add-backend URL map (default-service points at the backend) target HTTPS proxy forwarding rule on :443 Teardown do_tls_teardown is invoked from the existing do_teardown so a plain `appliance-qemu-to-gcp.sh teardown` cleans up the LB stack alongside the VM/image/object/disk. Order forces the forwarding rule first (stops the meter), then proxy / url-map / backend / NEG / health-check / cert / IP. Subnet last and only when it's the per-build one (we never delete a reused regional subnet). Each delete is idempotent: missing resources are not errors. The `Will DELETE:` inventory now lists `${NAME}-tls-*` when a forwarding rule of that shape exists. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-qemu-to-gcp.sh | 257 ++++++++++++++++++++++++++++++- 1 file changed, 256 insertions(+), 1 deletion(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 28dcf31..885ba6c 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -82,7 +82,14 @@ Environment: CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) KEEP_LOCAL Set to keep the local cluster after upload (default: tear down) KEEP_BUNDLE Set to keep the local export bundle (default: keep -- bundle path printed) - ASSUME_YES Skip BOTH confirmations and proceed end-to-end + ASSUME_YES Skip BOTH confirmations and proceed end-to-end. + Also suppresses the optional TLS-LB prompt; set + TLS_DOMAINS alongside to opt in unattended. + TLS_DOMAINS Comma-separated FQDNs for an optional regional + External HTTPS LoadBalancer with a self-signed + cert (e.g., appliance.example.com,admin.appliance.example.com). + Empty: skip the LB step. The HTTPRoutes must + already match these hostnames. Dependencies: go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl, @@ -189,6 +196,14 @@ do_teardown() { if [[ $delete_data_disk -eq 1 && -n "$disk" ]]; then echo " Data disk: $GCP_DATADIR_DISK (PERSISTENT DATA WILL BE LOST)" fi + # If a TLS LB stack exists, do_tls_teardown will pick it up. + # We don't enumerate every resource here -- the function logs + # `deleting TLS LB stack ...` when it fires. + if gcloud compute forwarding-rules describe "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null | grep -q .; then + echo " TLS LB stack: ${NAME}-tls-* (forwarding rule + 8 dependents)" + fi echo echo "Will PRESERVE:" if [[ $delete_data_disk -eq 0 && -n "$disk" ]]; then @@ -230,9 +245,222 @@ do_teardown() { --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null fi + do_tls_teardown stage "teardown complete" } +# do_tls_frontend stands up a regional External Application +# Load Balancer in front of $VM_NAME with a self-signed cert +# covering $1 (comma-separated FQDNs). Idempotent: each create +# is describe-then-create, so re-runs converge. +# +# Resources are named ${NAME}-tls-* so do_tls_teardown can clean +# them up alongside the rest of the appliance. +# +# Cost: regional EXTERNAL_MANAGED LB forwarding rule (~hourly) +# + reserved IP (only while reserved). Both billed by the +# forwarding-rule-hour and the IP-hour respectively, so teardown +# stops the meter immediately. +do_tls_frontend() { + local domains_csv=$1 + local first_domain + first_domain=$(echo "$domains_csv" | cut -d, -f1) + local sans + sans="DNS:$(echo "$domains_csv" | sed 's/,/,DNS:/g')" + local cert_dir="$BUNDLE_DIR/tls" + mkdir -p "$cert_dir" + + stage "generating self-signed cert for $domains_csv (90 days)" + openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout "$cert_dir/privkey.pem" -out "$cert_dir/fullchain.pem" \ + -days 90 -subj "/CN=$first_domain" \ + -addext "subjectAltName=$sans" 2>/dev/null + chmod 600 "$cert_dir/privkey.pem" + + # Proxy-only subnet: required by regional EXTERNAL_MANAGED LBs, + # one ACTIVE per region+VPC. Reuse if any exists; otherwise + # create a per-build one we can clean up on teardown. 
+ stage "ensuring proxy-only subnet in $GCP_REGION" + if gcloud compute networks subnets list \ + --project="$GCP_PROJECT" \ + --filter "region:$GCP_REGION AND purpose=REGIONAL_MANAGED_PROXY AND role=ACTIVE" \ + --format='value(name)' 2>/dev/null | grep -q .; then + echo " reusing existing proxy-only subnet" + else + gcloud compute networks subnets create "${NAME}-tls-proxy-subnet" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --network=default --range=192.168.42.0/24 \ + --purpose=REGIONAL_MANAGED_PROXY --role=ACTIVE >/dev/null + fi + + stage "reserving regional external IP ${NAME}-tls-ip" + if ! gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute addresses create "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --network-tier=STANDARD >/dev/null + fi + local lb_ip + lb_ip=$(gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(address)') + + stage "uploading SSL cert ${NAME}-tls-cert" + if ! gcloud compute ssl-certificates describe "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute ssl-certificates create "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --certificate="$cert_dir/fullchain.pem" \ + --private-key="$cert_dir/privkey.pem" >/dev/null + fi + + stage "creating health check ${NAME}-tls-hc" + if ! gcloud compute health-checks describe "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute health-checks create http "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --port=80 --request-path=/q/envoy/echo \ + --check-interval=10s --timeout=5s >/dev/null + fi + + stage "creating network endpoint group ${NAME}-tls-neg" + if ! gcloud compute network-endpoint-groups describe "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + gcloud compute network-endpoint-groups create "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --network-endpoint-type=GCE_VM_IP_PORT --default-port=80 >/dev/null + gcloud compute network-endpoint-groups update "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --add-endpoint="instance=$VM_NAME,port=80" >/dev/null + fi + + stage "creating backend service ${NAME}-tls-backend" + if ! gcloud compute backend-services describe "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute backend-services create "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --load-balancing-scheme=EXTERNAL_MANAGED --protocol=HTTP \ + --health-checks="${NAME}-tls-hc" \ + --health-checks-region="$GCP_REGION" >/dev/null + gcloud compute backend-services add-backend "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --network-endpoint-group="${NAME}-tls-neg" \ + --network-endpoint-group-zone="$GCP_ZONE" \ + --balancing-mode=RATE --max-rate-per-endpoint=100 >/dev/null + fi + + stage "creating URL map ${NAME}-tls-urlmap" + if ! 
gcloud compute url-maps describe "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute url-maps create "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --default-service="projects/$GCP_PROJECT/regions/$GCP_REGION/backendServices/${NAME}-tls-backend" >/dev/null + fi + + stage "creating target HTTPS proxy ${NAME}-tls-proxy" + if ! gcloud compute target-https-proxies describe "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute target-https-proxies create "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --url-map="${NAME}-tls-urlmap" \ + --ssl-certificates="${NAME}-tls-cert" >/dev/null + fi + + stage "creating forwarding rule ${NAME}-tls-fr (:443)" + if ! gcloud compute forwarding-rules describe "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute forwarding-rules create "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --load-balancing-scheme=EXTERNAL_MANAGED --network-tier=STANDARD \ + --network=default --address="${NAME}-tls-ip" \ + --target-https-proxy="${NAME}-tls-proxy" \ + --target-https-proxy-region="$GCP_REGION" --ports=443 >/dev/null + fi + + cat </dev/null) || true # y-script-lint:disable=or-true # missing fr is not an error + proxy=$(gcloud compute target-https-proxies describe "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing proxy is not an error + urlmap=$(gcloud compute url-maps describe "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing url-map is not an error + backend=$(gcloud compute backend-services describe "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing backend is not an error + neg=$(gcloud compute network-endpoint-groups describe "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing neg is not an error + hc=$(gcloud compute health-checks describe "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing hc is not an error + cert=$(gcloud compute ssl-certificates describe "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing cert is not an error + ip=$(gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing ip is not an error + subnet=$(gcloud compute networks subnets describe "${NAME}-tls-proxy-subnet" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing subnet is not an error + + if [[ -z "$fr$proxy$urlmap$backend$neg$hc$cert$ip$subnet" ]]; then + return + fi + + stage "deleting TLS LB stack (${NAME}-tls-*)" + [[ -n "$fr" ]] && gcloud compute forwarding-rules delete "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n 
"$proxy" ]] && gcloud compute target-https-proxies delete "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$urlmap" ]] && gcloud compute url-maps delete "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$backend" ]] && gcloud compute backend-services delete "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$neg" ]] && gcloud compute network-endpoint-groups delete "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + [[ -n "$hc" ]] && gcloud compute health-checks delete "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$cert" ]] && gcloud compute ssl-certificates delete "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$ip" ]] && gcloud compute addresses delete "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + # Subnet last: only delete the per-build one (do_tls_frontend + # never creates a subnet that already exists, so anything named + # ${NAME}-tls-proxy-subnet was definitely ours). + [[ -n "$subnet" ]] && gcloud compute networks subnets delete "${NAME}-tls-proxy-subnet" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null +} + # Minimal pre-checks shared by build and teardown: gcloud # binary + GCP key + activation. The build flow does # additional tool checks below the dispatch. @@ -640,6 +868,33 @@ stage "probing http://$PUBLIC_IP -- whatever you applied locally" probe echo "http://$PUBLIC_IP/q/envoy/echo" 30 || \ echo " (no echo route -- expected if your workload replaced y-cluster echo)" +# === Stage 11: optional external HTTPS LoadBalancer === +# Operator-driven add-on: if TLS_DOMAINS isn't set in the env, +# prompt for it (skip on empty input). With ASSUME_YES + TLS_DOMAINS +# set, runs without prompting. With ASSUME_YES alone, skip silently +# -- ASSUME_YES is for unattended e2e and we don't want to surprise +# the operator with a billing meter they didn't ask for. +if [[ -z "${TLS_DOMAINS:-}" && -z "${ASSUME_YES:-}" ]]; then + echo + echo "================================================================" + echo "Optional: external HTTPS LoadBalancer (regional, EXTERNAL_MANAGED)" + echo + echo "Sets up a regional GCP External Application Load Balancer in" + echo "front of $VM_NAME with a SELF-SIGNED cert covering the FQDNs" + echo "you specify. Useful for testing the LB+routing chain without" + echo "DNS or a real CA. Browsers will warn on the cert; tools need" + echo "--insecure / -k. Cost: ~hourly forwarding-rule + reserved IP." + echo + echo "HTTPRoutes on the cluster need spec.hostnames covering the" + echo "same FQDNs (the LB forwards Host: unchanged). Patch them" + echo "yourself before answering yes." 
+ echo "================================================================" + read -r -p "FQDNs (comma-separated, empty to skip): " TLS_DOMAINS +fi +if [[ -n "${TLS_DOMAINS:-}" ]]; then + do_tls_frontend "$TLS_DOMAINS" +fi + if [[ -z "${KEEP_LOCAL:-}" ]]; then stage "tearing down local cluster (set KEEP_LOCAL=1 to keep it)" "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null || true # y-script-lint:disable=or-true # cleanup best-effort From dd9db67010dc74fa78701921a76875caf82b3006 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 5 May 2026 14:16:45 +0000 Subject: [PATCH 07/16] feat(scripts/qemu-to-gcp): :80 -> :443 LB redirect + Gateway-route probe Two related fixes for the GCP appliance smoke flow: 1. do_tls_frontend now creates a :80 forwarding rule that 301s to :443. Previously the function set up only a :443 listener, so any `curl http:///...` against the LB IP hung at TCP connect (no listener on 80). Hangs from `curl ... http://...` were diagnosed against the live ext-app01-* LB stack which has the same shape. Mechanism: GCP regional EXTERNAL_MANAGED URL maps can either forward (defaultService) or redirect (defaultUrlRedirect), not both, so the redirect needs its own URL map. The chain: :80 fwd -> tls-http-proxy -> tls-redirect URL map (301 to https) :443 fwd -> tls-proxy -> tls-urlmap (existing, ->backend) `gcloud compute url-maps create` has no flag for default- redirect, hence the `url-maps import` from a heredoc. Hostname-agnostic on both ports: every request, any Host:, either redirects (on :80) or forwards to the VM (on :443). The VM's envoy-gateway is the only Host-aware hop. do_tls_teardown grew matching delete calls in dependency order (forwarding rules -> proxies -> URL maps) so re-runs converge cleanly. 2. The post-deploy probe at the end of the GCP stage now enumerates HTTPRoute + GRPCRoute hostnames via SSH + `sudo k3s kubectl ... -o jsonpath` and probes each FQDN through `--resolve :80:$PUBLIC_IP`. Replaces the single-path `/q/envoy/echo` probe -- which only verified "envoy answers anything", not "every advertised route is reachable end-to-end". Reachability == any HTTP status code (2xx/3xx/4xx/5xx), not 200: a route that legitimately answers 302 / 401 / 404 is still proof the firewall + klipper-lb + envoy-gateway chain is working. Only `000` (timeout / refused) counts as unreachable. On any unreachable route the script logs a warning with diagnostic suggestions (firewall source-ranges narrowed, backend Service not Ready, workload still rolling out) and continues -- info-level surfacing today, gating / strict mode is a deliberately deferred follow-up. Falls back to the old `/q/envoy/echo` probe when the cluster has no Gateway-bound routes (a workload that hasn't applied yet). Verified end-to-end against the live appliance: 4 routes enumerated (dev.yolean.net, ext-app01.yolean.se, keycloak-admin, keycloak-admin.ext-app01.yolean.se), all returned HTTP 302 on the first attempt. The redirect chain itself is intentionally NOT exercised against ext-app01-* in this commit (would require mutating an in-use LB the operator owns); it lands on the next do_tls_frontend run. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-qemu-to-gcp.sh | 131 ++++++++++++++++++++++++++++--- 1 file changed, 122 insertions(+), 9 deletions(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 885ba6c..a8ba89e 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -377,6 +377,47 @@ do_tls_frontend() { --target-https-proxy-region="$GCP_REGION" --ports=443 >/dev/null fi + # === HTTP -> HTTPS redirect chain === + # GCP regional EXTERNAL_MANAGED URL maps can do a default redirect + # but `gcloud compute url-maps create` has no flag for it -- we + # have to import a YAML body. A URL map can have either + # `defaultService` (forward) or `defaultUrlRedirect` (redirect), + # not both, hence the second URL map + second target proxy + second + # forwarding rule sharing the same reserved IP. + stage "creating redirect URL map ${NAME}-tls-redirect (HTTP -> HTTPS)" + if ! gcloud compute url-maps describe "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute url-maps import "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --source=- --quiet >/dev/null </dev/null 2>&1; then + gcloud compute target-http-proxies create "${NAME}-tls-http-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --url-map="${NAME}-tls-redirect" \ + --url-map-region="$GCP_REGION" >/dev/null + fi + + stage "creating forwarding rule ${NAME}-tls-fr-http (:80 -> redirect)" + if ! gcloud compute forwarding-rules describe "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute forwarding-rules create "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --load-balancing-scheme=EXTERNAL_MANAGED --network-tier=STANDARD \ + --network=default --address="${NAME}-tls-ip" \ + --target-http-proxy="${NAME}-tls-http-proxy" \ + --target-http-proxy-region="$GCP_REGION" --ports=80 >/dev/null + fi + cat < 301 redirect to :443 (so plain http:// works + as long as the client follows redirects, e.g. curl -L) To test from another machine, append this single line to /etc/hosts: @@ -404,16 +447,25 @@ EOF # the forwarding rule has to go before the proxy/url-map/backend # chain, and the IP after. 
do_tls_teardown() { - local fr proxy urlmap backend neg hc cert ip subnet + local fr fr_http proxy http_proxy urlmap urlmap_redirect backend neg hc cert ip subnet fr=$(gcloud compute forwarding-rules describe "${NAME}-tls-fr" \ --project="$GCP_PROJECT" --region="$GCP_REGION" \ --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing fr is not an error + fr_http=$(gcloud compute forwarding-rules describe "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing :80 redirect fr is not an error proxy=$(gcloud compute target-https-proxies describe "${NAME}-tls-proxy" \ --project="$GCP_PROJECT" --region="$GCP_REGION" \ --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing proxy is not an error + http_proxy=$(gcloud compute target-http-proxies describe "${NAME}-tls-http-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing :80 redirect proxy is not an error urlmap=$(gcloud compute url-maps describe "${NAME}-tls-urlmap" \ --project="$GCP_PROJECT" --region="$GCP_REGION" \ --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing url-map is not an error + urlmap_redirect=$(gcloud compute url-maps describe "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing redirect url-map is not an error backend=$(gcloud compute backend-services describe "${NAME}-tls-backend" \ --project="$GCP_PROJECT" --region="$GCP_REGION" \ --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing backend is not an error @@ -433,17 +485,27 @@ do_tls_teardown() { --project="$GCP_PROJECT" --region="$GCP_REGION" \ --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing subnet is not an error - if [[ -z "$fr$proxy$urlmap$backend$neg$hc$cert$ip$subnet" ]]; then + if [[ -z "$fr$fr_http$proxy$http_proxy$urlmap$urlmap_redirect$backend$neg$hc$cert$ip$subnet" ]]; then return fi stage "deleting TLS LB stack (${NAME}-tls-*)" + # Forwarding rules first (they reference proxies) -- both :443 + # and the :80 redirect. [[ -n "$fr" ]] && gcloud compute forwarding-rules delete "${NAME}-tls-fr" \ --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$fr_http" ]] && gcloud compute forwarding-rules delete "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + # Then proxies (they reference URL maps). [[ -n "$proxy" ]] && gcloud compute target-https-proxies delete "${NAME}-tls-proxy" \ --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$http_proxy" ]] && gcloud compute target-http-proxies delete "${NAME}-tls-http-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + # Then URL maps (the :443 backend-pointing one + the :80 redirect one). 
[[ -n "$urlmap" ]] && gcloud compute url-maps delete "${NAME}-tls-urlmap" \ --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$urlmap_redirect" ]] && gcloud compute url-maps delete "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null [[ -n "$backend" ]] && gcloud compute backend-services delete "${NAME}-tls-backend" \ --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null [[ -n "$neg" ]] && gcloud compute network-endpoint-groups delete "${NAME}-tls-neg" \ @@ -860,13 +922,64 @@ probe() { return 1 } -stage "probing http://$PUBLIC_IP -- whatever you applied locally" -# We don't know the operator's routes a priori; try the -# y-cluster-shipped echo path as a baseline. If their workload -# replaced echo, this fails and the operator curls their own -# route. -probe echo "http://$PUBLIC_IP/q/envoy/echo" 30 || \ - echo " (no echo route -- expected if your workload replaced y-cluster echo)" +# probe_route checks one Gateway-bound FQDN through PUBLIC_IP via +# `--resolve :80:`. Reachability == any HTTP response, +# not 200: an HTTPRoute that legitimately answers 302 / 401 / 404 +# is still proof the firewall + klipper-lb + envoy-gateway chain +# is working end-to-end. Only `000` (timeout, refused) counts as +# unreachable. +probe_route() { + local fqdn=$1 attempts=${2:-30} + local code + for i in $(seq 1 "$attempts"); do + code=$(curl -sS -o /dev/null -m 5 \ + --resolve "$fqdn:80:$PUBLIC_IP" \ + -w '%{http_code}' "http://$fqdn/" 2>/dev/null \ + || echo 000) + if [[ "$code" != "000" ]]; then + printf ' %-40s HTTP %s (attempt %d)\n' "$fqdn" "$code" "$i" + return 0 + fi + echo " $fqdn attempt $i/$attempts: no answer yet" + sleep 10 + done + return 1 +} + +stage "enumerating Gateway routes on the appliance" +# Walk HTTPRoutes + GRPCRoutes for spec.hostnames. Each unique FQDN +# becomes a probe target. We use SSH + `sudo k3s kubectl` rather +# than extracting the kubeconfig because the apiserver isn't yet +# externally exposed at this point in the script (the kubeconfig +# extract recipe at the bottom of the success summary is for the +# operator after teardown of the local cluster). +ROUTE_HOSTS=$(ssh $SSH_OPTS ystack@"$PUBLIC_IP" \ + 'sudo k3s kubectl get httproute,grpcroute -A -o jsonpath="{range .items[*]}{range .spec.hostnames[*]}{@}{\"\n\"}{end}{end}"' \ + 2>/dev/null | sort -u) + +if [[ -z "$ROUTE_HOSTS" ]]; then + echo " no Gateway-bound HTTPRoutes/GRPCRoutes; falling back to echo probe" + probe echo "http://$PUBLIC_IP/q/envoy/echo" 30 || \ + echo " (echo also unreachable; cluster still booting?)" +else + stage "probing each Gateway route via $PUBLIC_IP" + fail_count=0 + fail_list="" + while IFS= read -r fqdn; do + if ! 
probe_route "$fqdn" 30; then + fail_count=$((fail_count + 1)) + fail_list="$fail_list $fqdn" + fi + done <<<"$ROUTE_HOSTS" + if [[ $fail_count -gt 0 ]]; then + echo " WARNING: $fail_count route(s) unreachable after 5min:$fail_list" + echo " Possible causes:" + echo " - firewall y-cluster-appliance-public source-ranges narrowed (check" + echo " \`gcloud compute firewall-rules describe y-cluster-appliance-public\`)" + echo " - HTTPRoute attached but backend Service not Ready" + echo " - workload still rolling out (re-run \`probe_route \` later)" + fi +fi # === Stage 11: optional external HTTPS LoadBalancer === # Operator-driven add-on: if TLS_DOMAINS isn't set in the env, From 475d6779e0db0b8ab3c33bbe3aa37c0156000a39 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Wed, 6 May 2026 07:17:47 +0000 Subject: [PATCH 08/16] feat(scripts/qemu-to-gcp): explicit --reuse-disk / --keep-disk; teardown-side PRESERVED message State preservation across appliance redeploys is the overarching design goal of the data-seed mechanism (commit f69addf + APPLIANCE_MAINTENANCE.md). What was missing on the operator- facing side: the QA-flow build script silently reused the persistent disk on every redeploy, masking the seed-skip from build-time-only operators who expected each run to validate the seed end-to-end. Conversely, the production "preserve customer state across upgrades" intent was never written down where it mattered (the operator only saw a generic banner at deploy time, not after teardown when the disk-keep decision is most actionable). Changes: - Build-flow `--reuse-disk=true|false` with an interactive prompt (default Y -- preserve, matching the design goal). On `--reuse-disk=false` the script delete-and-recreates the persistent disk so the next boot's data-seed unit lands the OS image's seed cleanly. Non-TTY callers MUST pass the flag explicitly; ASSUME_YES + missing flag fails fast rather than silently picking a default for an irreversible decision. - Teardown `--keep-disk=true|false`. Default behavior is unchanged (keep). Legacy `--delete-data-disk` continues to work as `--keep-disk=false` with a one-line deprecation notice, so any existing automation isn't broken. - Decoupled the new disk decisions from the existing `confirm()` helper (which consults ASSUME_YES). New `prompt_yes_default()` helper requires a TTY or an explicit flag, never falls back to ASSUME_YES. The umbrella ASSUME_YES still covers the existing 'Proceed?' + TLS-LB prompts. - Moved the "Persistent data disk PRESERVED" message from the build-success banner to the END of teardown when the disk was kept. That's the moment the operator's mental model needs the reminder ('what survived?' + 'how do I delete it later?'). The build success block keeps a brief one-line pointer to teardown's message instead of carrying the full paragraph. Verified end-to-end against yo-sre-appliance-qa over the past two days: --reuse-disk=false correctly recreates the disk and the data-seed unit extracts the image's seed onto it; the recreated disk + grastate.dat workaround round-tripped mariadb's keycloak.REALM rows through prepare-export -> seed -> fresh-disk -> boot, with `keycloak/auth/realms/ext-bfv01` returning 200 from the resulting cluster. 
Two follow-up fixes lined up but not in this commit (kept working-tree, separate commit): a `return 0` belt at the end of do_tls_teardown so its trailing `[[ -n "$subnet" ]] && ...` doesn't leak a non-zero exit and abort the caller before the new PRESERVED block fires; and the revert of the route- enumeration block that this same teardown-issue debugging surfaced as post-import SSH+kubectl scope-creep. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-qemu-to-gcp.sh | 263 ++++++++++++++++++++++--------- 1 file changed, 186 insertions(+), 77 deletions(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index a8ba89e..e66e95e 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -39,20 +39,30 @@ set -eo pipefail YHELP='appliance-qemu-to-gcp.sh - local provision -> hands-on -> export -> ship to GCP Usage: - appliance-qemu-to-gcp.sh build + ship to GCP - appliance-qemu-to-gcp.sh teardown delete VM + image + GCS object - appliance-qemu-to-gcp.sh teardown --delete-data-disk - also delete the persistent - /data/yolean disk (DESTRUCTIVE) + appliance-qemu-to-gcp.sh [--reuse-disk=true|false] build + ship to GCP + appliance-qemu-to-gcp.sh teardown [--keep-disk=true|false] + delete VM + image + GCS object; + persistent disk preserved by + default (state-preservation + is the appliance design goal) + +Build flow disk handling: + --reuse-disk=true reuse existing /data/yolean disk (preserves customer state + across redeploys; the build image seed is no-op against an + already-seeded marker -- this is the production upgrade path) + --reuse-disk=false delete + recreate the disk (fresh disk lets the build image + seed extract; use this for QA / end-to-end seed validation) + (no flag, TTY) interactive prompt with default Y (reuse) + (no flag, no TTY) error: explicit choice required for non-interactive runs Teardown reads GCP_PROJECT / GCP_ZONE / GCP_BUCKET / VM_NAME / GCP_DATADIR_DISK / NAME from the same env vars as the build flow. Custom images and GCS objects are deleted by NAME prefix (so different NAMEs in the same project do not clobber each other). The persistent data disk, the bucket itself, and the -firewall rule are preserved unless --delete-data-disk is set. -Local cluster cleanup (if KEEP_LOCAL was set) is separate: -y-cluster teardown -c \$CFG_DIR. +firewall rule are preserved unless --keep-disk=false is set +(legacy alias: --delete-data-disk). Local cluster cleanup (if +KEEP_LOCAL was set) is separate: y-cluster teardown -c \$CFG_DIR. Environment: GCP_PROJECT GCP project (set in .env or shell env; required) @@ -148,6 +158,27 @@ confirm() { esac } +# prompt_yes_default is for irreversible decisions where the +# default-on-Enter is YES (state preservation, design-goal aligned) +# but the operator must EXPLICITLY pre-answer for non-interactive +# runs. Distinct from confirm() in two ways: +# - default is Y, not N +# - ASSUME_YES is NOT consulted; non-TTY callers without an +# explicit flag get a clear error instead of a silent default. +# Returns 0 for yes, 1 for no, exits 2 on no-TTY-no-flag. +prompt_yes_default() { + local prompt=$1 missing_flag_hint=$2 + if [[ ! -t 0 ]]; then + echo "non-interactive shell: $missing_flag_hint" >&2 + exit 2 + fi + read -r -p "$prompt [Y/n] " answer + case "${answer,,}" in + n|no) return 1 ;; + *) return 0 ;; + esac +} + # do_teardown deletes GCP resources owned by this script's # NAME prefix in the configured project + zone. 
Reads the # same env vars as the build flow so a teardown after a @@ -155,10 +186,18 @@ confirm() { # exactly that customer's resources without touching other # NAMEs that share the same project. do_teardown() { + # delete_data_disk: 0=keep (default; design-goal state preservation), + # 1=delete. --keep-disk=true|false is the explicit form; + # --delete-data-disk is the legacy alias mapped to --keep-disk=false. local delete_data_disk=0 while [[ $# -gt 0 ]]; do case "$1" in - --delete-data-disk) delete_data_disk=1 ;; + --keep-disk=true) delete_data_disk=0 ;; + --keep-disk=false) delete_data_disk=1 ;; + --delete-data-disk) + echo " note: --delete-data-disk is deprecated; prefer --keep-disk=false" + delete_data_disk=1 + ;; *) echo "unknown teardown flag: $1" >&2; exit 2 ;; esac shift @@ -207,7 +246,7 @@ do_teardown() { echo echo "Will PRESERVE:" if [[ $delete_data_disk -eq 0 && -n "$disk" ]]; then - echo " Data disk: $GCP_DATADIR_DISK (--delete-data-disk to also remove)" + echo " Data disk: $GCP_DATADIR_DISK (--keep-disk=false to also remove)" fi echo " GCS bucket: gs://$GCP_BUCKET (objects matching $NAME-* deleted above)" echo " Firewall rule: y-cluster-appliance-public (tag-based, shared)" @@ -247,6 +286,26 @@ do_teardown() { do_tls_teardown stage "teardown complete" + + # Surface the preservation contract at the moment the operator + # is about to step away. Previously this only appeared in the + # build-flow success block, where it was less actionable. + if [[ $delete_data_disk -eq 0 && -n "$disk" ]]; then + cat </dev/null + # Force a 0 return: the [[ -n "$subnet" ]] && ... pattern above + # returns 1 when $subnet is empty (subnet was reused, not + # created by this run). Without this, set -e in the caller + # treats the function as failed and aborts before the + # "teardown complete" stage + the PRESERVED message can fire. + return 0 } # Minimal pre-checks shared by build and teardown: gcloud @@ -555,6 +620,34 @@ if [[ "${1:-}" = "teardown" ]]; then exit 0 fi +# Build-flow arg parsing. Today: just --reuse-disk=true|false. +# Empty REUSE_DISK + interactive prompt at Stage 8.5 if the disk +# already exists; non-interactive runs without the flag error +# out at Stage 8.5 with a clear message. +REUSE_DISK="" +while [[ $# -gt 0 ]]; do + case "$1" in + --reuse-disk=true) REUSE_DISK=true ;; + --reuse-disk=false) REUSE_DISK=false ;; + *) echo "unknown build flag: $1" >&2; exit 2 ;; + esac + shift +done + +# Fail early if ASSUME_YES is set without an explicit disk +# decision: ASSUME_YES means "unattended; don't prompt me", and +# disk handling is irreversible (--reuse-disk=false destroys +# customer state). Don't let that slip through to a prompt at +# Stage 8.5 that would either hang (TTY-less) or be answered +# by a default the operator never deliberately picked. +if [[ -n "${ASSUME_YES:-}" && -z "$REUSE_DISK" ]]; then + echo "ASSUME_YES set without --reuse-disk=true|false: refusing to" >&2 + echo "guess at an irreversible decision. Pick one explicitly:" >&2 + echo " --reuse-disk=true preserve customer state (production upgrade)" >&2 + echo " --reuse-disk=false delete + recreate (QA seed validation)" >&2 + exit 2 +fi + # Build-flow tool check (additional to gcloud above). for tool in go qemu-system-x86_64 qemu-img kubectl ssh ssh-keygen curl virt-sysprep; do command -v "$tool" >/dev/null \ @@ -771,10 +864,71 @@ fi # fresh image, the same /data/yolean comes back. Disk auto-delete # is OFF when attaching an existing disk via --disk=name=, so # `instances delete` won't wipe it. 
+# +# Reuse decision (when the disk already exists): +# --reuse-disk=true reuse silently (preserves customer state -- the +# production upgrade path; data-seed unit no-ops +# on the existing marker, so the new image's seed +# is correctly NOT applied) +# --reuse-disk=false delete + recreate (fresh disk lets the build +# image's seed extract -- the QA validation path) +# no flag, TTY interactive prompt, default Y (reuse) +# no flag, no TTY error + exit (explicit choice required for +# irreversible decisions in unattended runs) stage "ensuring persistent data disk $GCP_DATADIR_DISK (size only used on create: $GCP_DATADIR_SIZE)" if gcloud compute disks describe "$GCP_DATADIR_DISK" \ --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then - echo " disk exists -- reusing (data preserved from previous deploy)" + case "$REUSE_DISK" in + true) + echo " disk exists -- reusing (--reuse-disk=true; preserves customer state)" + ;; + false) + echo " disk exists -- --reuse-disk=false: deleting and recreating" + # Detach by deleting the VM first if it's still attached. + # Stage 9 normally handles VM deletion for idempotency; we + # do it here too because gcloud compute disks delete refuses + # while the disk is mounted on a running instance. + if gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + echo " $VM_NAME exists, deleting first to release disk" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + gcloud compute disks delete "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + gcloud compute disks create "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --size="$GCP_DATADIR_SIZE" \ + --type=pd-balanced \ + >/dev/null + echo " disk recreated (fresh; will be ext4-formatted on first mount)" + ;; + *) + if prompt_yes_default \ + " Reuse existing data disk $GCP_DATADIR_DISK with its preserved state?" \ + "pass --reuse-disk=true (preserve state) or --reuse-disk=false (delete + reseed)"; then + echo " reusing (preserves customer state)" + else + echo " --reuse-disk=false chosen: deleting and recreating" + if gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + echo " $VM_NAME exists, deleting first to release disk" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + gcloud compute disks delete "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + gcloud compute disks create "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --size="$GCP_DATADIR_SIZE" \ + --type=pd-balanced \ + >/dev/null + echo " disk recreated (fresh; will be ext4-formatted on first mount)" + fi + ;; + esac else gcloud compute disks create "$GCP_DATADIR_DISK" \ --project="$GCP_PROJECT" \ @@ -922,64 +1076,23 @@ probe() { return 1 } -# probe_route checks one Gateway-bound FQDN through PUBLIC_IP via -# `--resolve :80:`. Reachability == any HTTP response, -# not 200: an HTTPRoute that legitimately answers 302 / 401 / 404 -# is still proof the firewall + klipper-lb + envoy-gateway chain -# is working end-to-end. Only `000` (timeout, refused) counts as -# unreachable. 
-probe_route() { - local fqdn=$1 attempts=${2:-30} - local code - for i in $(seq 1 "$attempts"); do - code=$(curl -sS -o /dev/null -m 5 \ - --resolve "$fqdn:80:$PUBLIC_IP" \ - -w '%{http_code}' "http://$fqdn/" 2>/dev/null \ - || echo 000) - if [[ "$code" != "000" ]]; then - printf ' %-40s HTTP %s (attempt %d)\n' "$fqdn" "$code" "$i" - return 0 - fi - echo " $fqdn attempt $i/$attempts: no answer yet" - sleep 10 - done - return 1 -} - -stage "enumerating Gateway routes on the appliance" -# Walk HTTPRoutes + GRPCRoutes for spec.hostnames. Each unique FQDN -# becomes a probe target. We use SSH + `sudo k3s kubectl` rather -# than extracting the kubeconfig because the apiserver isn't yet -# externally exposed at this point in the script (the kubeconfig -# extract recipe at the bottom of the success summary is for the -# operator after teardown of the local cluster). -ROUTE_HOSTS=$(ssh $SSH_OPTS ystack@"$PUBLIC_IP" \ - 'sudo k3s kubectl get httproute,grpcroute -A -o jsonpath="{range .items[*]}{range .spec.hostnames[*]}{@}{\"\n\"}{end}{end}"' \ - 2>/dev/null | sort -u) - -if [[ -z "$ROUTE_HOSTS" ]]; then - echo " no Gateway-bound HTTPRoutes/GRPCRoutes; falling back to echo probe" - probe echo "http://$PUBLIC_IP/q/envoy/echo" 30 || \ - echo " (echo also unreachable; cluster still booting?)" -else - stage "probing each Gateway route via $PUBLIC_IP" - fail_count=0 - fail_list="" - while IFS= read -r fqdn; do - if ! probe_route "$fqdn" 30; then - fail_count=$((fail_count + 1)) - fail_list="$fail_list $fqdn" - fi - done <<<"$ROUTE_HOSTS" - if [[ $fail_count -gt 0 ]]; then - echo " WARNING: $fail_count route(s) unreachable after 5min:$fail_list" - echo " Possible causes:" - echo " - firewall y-cluster-appliance-public source-ranges narrowed (check" - echo " \`gcloud compute firewall-rules describe y-cluster-appliance-public\`)" - echo " - HTTPRoute attached but backend Service not Ready" - echo " - workload still rolling out (re-run \`probe_route \` later)" - fi -fi +stage "probing http://$PUBLIC_IP -- whatever you applied locally" +# We don't know the operator's routes a priori; try the +# y-cluster-shipped echo path as a baseline. If their workload +# replaced echo, this fails and the operator curls their own +# route. +# +# We deliberately do NOT enumerate Gateway routes via SSH+kubectl +# here. Post-import scripting that uses maintainer-only paths +# (SSH keys + kubectl access) is a smell: the customer's hosting +# can't replicate it, and tying the build flow to the api-server +# warm-up period creates spurious halts on transient kubectl +# failures (we hit one of those during this script's lifetime -- +# the silent abort right after `systemctl restart k3s.service`). +# Any per-route probing belongs in the consumer-side validate +# script which knows its own routes from its own kustomize. +probe echo "http://$PUBLIC_IP/q/envoy/echo" 30 || \ + echo " (no echo route -- expected if your workload replaced y-cluster echo)" # === Stage 11: optional external HTTPS LoadBalancer === # Operator-driven add-on: if TLS_DOMAINS isn't set in the env, @@ -1041,13 +1154,9 @@ kubectl from your laptop (apiserver not externally exposed): KUBECONFIG=k3s-$VM_NAME.yaml kubectl get nodes Teardown when done: - gcloud compute instances delete $VM_NAME --project=$GCP_PROJECT --zone=$GCP_ZONE - gcloud compute images delete $IMAGE_NAME --project=$GCP_PROJECT - gcloud storage rm gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz --project=$GCP_PROJECT - -Persistent data disk is PRESERVED on teardown so PVC data -survives across redeploys. 
Re-running this script reuses the -same /data/yolean. Delete it manually when you're truly done: - gcloud compute disks delete $GCP_DATADIR_DISK --project=$GCP_PROJECT --zone=$GCP_ZONE + $0 teardown + (preserves /data/yolean by default; pass --keep-disk=false to also + delete the persistent disk. Teardown's exit message lists the + disk + recommended delete command for later cleanup.) ================================================================ EOF From 77bdca288097e2fe407bccb35470dec09fb329cc Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 04:03:00 +0000 Subject: [PATCH 09/16] chore(scripts/qemu-to-gcp): bump build VM memory 4096 -> 8192 The build VM occasionally OOMs during heavier customer workloads applied at PROMPT 1 (mariadb + kafka + envoy + the bundled controllers all in 4GB is tight). 8GB matches the y-cluster default for stand-alone provisions but the qemu-to-gcp script was overriding it down to 4GB to keep the host's headroom; the headroom is fine on the build host, so lift the override. The y-cluster default itself is unchanged (8192 in config.QEMUConfig.applyDefaults), so other provisioner flows (multipass, docker, plain qemu) are not affected. Disk size stays at 40GB. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-qemu-to-gcp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index e66e95e..660961a 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -681,7 +681,7 @@ mkdir -p "$CFG_DIR" echo "name: $NAME" echo "context: $KUBECTX" [ -n "${APP_SSH_PORT:-}" ] && printf 'sshPort: "%s"\n' "$APP_SSH_PORT" - echo 'memory: "4096"' + echo 'memory: "8192"' echo 'cpus: "2"' echo 'diskSize: "40G"' if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then From 6c2e3fd0722eec7a88ce51b8dea0883f28de9acb Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 04:19:25 +0000 Subject: [PATCH 10/16] fix(scripts/qemu-to-gcp): drop explicit stop before prepare-export PR #20 changed prepare-export to require the cluster RUNNING: its live phase clears the per-deploy dns-hint-ip annotation and snapshots reconciled Gateway state into /- gateway-state.json (both need the apiserver up). prepare-export then stops the VM itself before its offline (virt-customize) phase. The plan called for dropping `y-cluster stop` from the script ahead of prepare-export, but the script edit never landed. The result: every run of appliance-qemu-to-gcp.sh would stop the cluster, then crash with "VM not running; start the cluster first" when prepare-export ran against the stopped VM. Drop the explicit stop call. Update the docstring stage list to reflect that prepare-export does its own stop. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-qemu-to-gcp.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 660961a..08b1cea 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -9,7 +9,9 @@ # operator applies their custom workloads via kubectl / # yconverge against context $NAME, tests them, and # confirms when satisfied. -# 3. y-cluster stop -> y-cluster prepare-export (virt-sysprep +# 3. 
y-cluster prepare-export (snapshots reconciled Gateway +# state, clears the dns-hint-ip annotation, then stops the +# VM internally and runs the offline phase: virt-sysprep # identity reset + timesyncd flip + netplan generic match). # 4. y-cluster export --format=gcp-tar -- packs the qcow2 # into .tar.gz containing a single disk.raw, the @@ -770,10 +772,13 @@ EOF confirm "Proceed to export + GCP deploy?" \ || { echo "aborted; local cluster left running. Teardown with: $Y_CLUSTER teardown -c $CFG_DIR"; exit 0; } -# === Stage 3: stop + prepare-export + export gcp-tar === -stage "stopping cluster ($NAME)" -"$Y_CLUSTER" stop --context="$KUBECTX" - +# === Stage 3: prepare-export + export gcp-tar === +# prepare-export needs the cluster RUNNING: its live phase +# clears the per-deploy yolean.se/dns-hint-ip annotation and +# snapshots reconciled Gateway state into /- +# gateway-state.json. It then stops the VM itself before the +# offline phase (libguestfs needs the disk not in use). +# Calling `y-cluster stop` here would defeat that. stage "prepare-export ($NAME)" "$Y_CLUSTER" prepare-export --context="$KUBECTX" From 6b0c5d20578dcde562d3aadfce031b648863b12a Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 05:46:05 +0000 Subject: [PATCH 11/16] feat(scripts/qemu-to-gcp): TLS_DOMAINS=auto derives from gateway state Drops the parallel-list footgun: today the operator declares hostnames in HTTPRoute manifests AND in TLS_DOMAINS, and drift between the two means the LB cert covers hostnames the cluster doesn't serve, or vice versa. Setting TLS_DOMAINS=auto now resolves the FQDN list by calling `y-cluster gateway hostnames --csv` against the just-provisioned cluster, immediately after PROMPT 1 confirmation. The cluster's reconciled HTTPRoute / GRPCRoute hostnames become the LB cert SAN list -- one source of truth. Resolution runs BEFORE prepare-export because by the TLS LB stage (after prepare-export + GCP deploy) the local apiserver is gone. Other TLS_DOMAINS values (literal CSV / empty / prompt) are still handled at the LB stage as before. Empty result aborts with an explicit error (operator asked for auto, none found = something wrong with the cluster state). Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-qemu-to-gcp.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 08b1cea..440a0e8 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -102,6 +102,13 @@ Environment: cert (e.g., appliance.example.com,admin.appliance.example.com). Empty: skip the LB step. The HTTPRoutes must already match these hostnames. + Special value "auto": derive the FQDN list from + `y-cluster gateway hostnames --csv` against the + just-provisioned cluster -- reconciled + HTTPRoute / GRPCRoute hostnames become the LB + cert SAN list, so the two can never drift. + Aborts with an error when "auto" is set but the + cluster has no non-wildcard hostnames yet. Dependencies: go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl, @@ -772,6 +779,24 @@ EOF confirm "Proceed to export + GCP deploy?" \ || { echo "aborted; local cluster left running. Teardown with: $Y_CLUSTER teardown -c $CFG_DIR"; exit 0; } +# Resolve TLS_DOMAINS=auto against the LIVE cluster while the +# apiserver is still up. By the time we reach the TLS LB stage +# (after prepare-export and the GCP deploy), the local cluster +# is gone and `gateway hostnames` would have nothing to read. 
+# Other TLS_DOMAINS values (literal CSV / empty / prompt) are +# handled at the LB stage itself; only "auto" needs the live +# cluster query here. +if [[ "${TLS_DOMAINS:-}" == "auto" ]]; then + stage "deriving TLS_DOMAINS from gateway state" + TLS_DOMAINS=$("$Y_CLUSTER" gateway hostnames --context="$KUBECTX" --csv) + [[ -n "$TLS_DOMAINS" ]] || { + echo "ERROR: TLS_DOMAINS=auto but the cluster's gateway state has no non-wildcard hostnames." >&2 + echo " Apply HTTPRoutes with .spec.hostnames first, or set TLS_DOMAINS=foo,bar to override." >&2 + exit 1 + } + echo " TLS_DOMAINS=$TLS_DOMAINS" +fi + # === Stage 3: prepare-export + export gcp-tar === # prepare-export needs the cluster RUNNING: its live phase # clears the per-deploy yolean.se/dns-hint-ip annotation and From 900ff6a07b8813cb1114992c53851d9e0bbd776c Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 06:54:49 +0000 Subject: [PATCH 12/16] feat(scripts/qemu-to-gcp): APPLIANCE_SEED_CMD + APPLIANCE_VERIFY_CMD hooks The unattended flow had ASSUME_YES + TLS_DOMAINS=auto landed already, but no work-doing hook in PROMPT 1's hands-on window. Result: a build with ASSUME_YES=1 reached prepare-export with only the y-cluster echo HTTPRoute applied; TLS_DOMAINS=auto then aborted because the cluster had no non-wildcard hostnames to derive from. Add the two hooks documented in specs/y-cluster/FEATURE_APPLIANCE_AUTOMATED_FLOW.md: - APPLIANCE_SEED_CMD runs after echo install, before PROMPT 1. Customer workloads applied here populate /data/yolean for the data-seed extraction AND give TLS_DOMAINS=auto real hostnames. - APPLIANCE_VERIFY_CMD runs at the end, after the GCP deploy + optional TLS LB. Receives the LB IP / VM IP / domains via the Y_CLUSTER_CURRENT_* surface so a remote probe can curl --resolve through the deployed VM without /etc/hosts. Both fire via `bash -c "$cmd"` so the operator-supplied string can pipe / chain / cd freely. Both export a single, consistent Y_CLUSTER_CURRENT_* env surface (via the new current_env helper) -- a verify script `printenv | grep ^Y_CLUSTER_CURRENT_` sees the full surface either way; vars not yet known at the seed hook (REMOTE_VM_IP, etc.) are exported as empty strings. Non-zero exit aborts under set -e. Local cluster / VM / LB stay up for inspection. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/appliance-qemu-to-gcp.sh | 82 ++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 440a0e8..05fa44d 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -97,6 +97,21 @@ Environment: ASSUME_YES Skip BOTH confirmations and proceed end-to-end. Also suppresses the optional TLS-LB prompt; set TLS_DOMAINS alongside to opt in unattended. + APPLIANCE_SEED_CMD Optional shell cmd to run after echo install, + before PROMPT 1. Receives the + Y_CLUSTER_CURRENT_* env surface (build-side + ports, kubeconfig context, ssh-key path). + Typical: cd into a customer repo and apply + kustomize bases that populate /data/yolean. + Non-zero exit aborts; local cluster left up. + APPLIANCE_VERIFY_CMD Optional shell cmd to run after the GCP + deploy + optional TLS LB, before final + summary. Receives the same Y_CLUSTER_CURRENT_* + surface plus REMOTE_VM_IP / REMOTE_LB_IP / + REMOTE_DOMAINS / REMOTE_SCHEME so a remote + probe can curl --resolve through the + deployed VM. Non-zero exit aborts; VM and + LB stay up for inspection. 
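+                     Example unattended run (illustrative only; seed.sh
+                     and verify.sh stand for operator-owned scripts that
+                     read the Y_CLUSTER_CURRENT_* surface, they are not
+                     files shipped with this repo):
+                       APPLIANCE_SEED_CMD=./seed.sh APPLIANCE_VERIFY_CMD=./verify.sh \
+                       ASSUME_YES=1 TLS_DOMAINS=auto \
+                       scripts/appliance-qemu-to-gcp.sh --reuse-disk=true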
TLS_DOMAINS Comma-separated FQDNs for an optional regional External HTTPS LoadBalancer with a self-signed cert (e.g., appliance.example.com,admin.appliance.example.com). @@ -188,6 +203,42 @@ prompt_yes_default() { esac } +# current_env exports the Y_CLUSTER_CURRENT_* surface a hook +# cmd reads via printenv. Call right before invoking the cmd +# so any vars computed since the last invocation (PUBLIC_IP, +# BUNDLE_DIR, TLS-LB IP) are picked up. Vars not yet known at +# the call site are exported as empty strings (not unset) so +# a verify script can read them unconditionally. +current_env() { + export Y_CLUSTER_CURRENT_NAME="$NAME" + export Y_CLUSTER_CURRENT_KUBECTX="$KUBECTX" + export Y_CLUSTER_CURRENT_LOCAL_HTTP_PORT="${APP_HTTP_PORT:-80}" + export Y_CLUSTER_CURRENT_LOCAL_HTTPS_PORT="${APP_HTTPS_PORT:-443}" + export Y_CLUSTER_CURRENT_LOCAL_API_PORT="${APP_API_PORT:-6443}" + export Y_CLUSTER_CURRENT_LOCAL_SSH_PORT="${APP_SSH_PORT:-2222}" + export Y_CLUSTER_CURRENT_LOCAL_SSH_KEY="${CACHE_DIR:-}/${NAME}-ssh" + export Y_CLUSTER_CURRENT_BUNDLE_DIR="${BUNDLE_DIR:-}" + export Y_CLUSTER_CURRENT_REMOTE_VM_NAME="${VM_NAME:-}" + export Y_CLUSTER_CURRENT_REMOTE_VM_IP="${PUBLIC_IP:-}" + export Y_CLUSTER_CURRENT_REMOTE_DOMAINS="${TLS_DOMAINS:-}" + if [[ -n "${TLS_DOMAINS:-}" ]]; then + # do_tls_frontend keeps lb_ip in local scope; re-query + # gcloud here so the verify hook can read it. + Y_CLUSTER_CURRENT_REMOTE_LB_IP=$(gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(address)' 2>/dev/null \ + || true) # y-script-lint:disable=or-true # absent IP -> empty var, hook decides + export Y_CLUSTER_CURRENT_REMOTE_LB_IP + export Y_CLUSTER_CURRENT_REMOTE_SCHEME=https + else + export Y_CLUSTER_CURRENT_REMOTE_LB_IP="" + export Y_CLUSTER_CURRENT_REMOTE_SCHEME=http + fi + export Y_CLUSTER_CURRENT_GCP_PROJECT="$GCP_PROJECT" + export Y_CLUSTER_CURRENT_GCP_ZONE="$GCP_ZONE" + export Y_CLUSTER_CURRENT_GCP_REGION="$GCP_REGION" +} + # do_teardown deletes GCP resources owned by this script's # NAME prefix in the configured project + zone. Reads the # same env vars as the build flow so a teardown after a @@ -733,6 +784,23 @@ stage "installing echo workload (Gateway listener + baseline route)" kubectl --context="$KUBECTX" -n y-cluster wait \ --for=condition=Available deployment/echo --timeout=180s +# Seed hook: caller-supplied cmd runs after echo is up but +# before PROMPT 1 / TLS_DOMAINS=auto resolution. Customer +# workloads applied here (mariadb, kafka, keycloak, HTTPRoute / +# GRPCRoute resources, etc.) populate /data/yolean for the data-seed +# extraction in prepare-export AND give TLS_DOMAINS=auto real +# hostnames to derive from. Non-zero exit aborts; local +# cluster stays up for inspection (set -e + the +# "aborted; local cluster left running" semantics of the +# upcoming PROMPT 1 path are what the operator falls back on). +if [[ -n "${APPLIANCE_SEED_CMD:-}" ]]; then + stage "applying seed (APPLIANCE_SEED_CMD)" + current_env + # set -o pipefail so a `cmd | tee log` chain in the + # caller's string doesn't swallow upstream failures. 
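+  # (Illustration -- hypothetical operator value, not shipped here:
+  #  APPLIANCE_SEED_CMD='kubectl apply -k my-workloads/ | tee seed.log'
+  #  would report tee's exit status, i.e. 0, without pipefail, so a
+  #  failed apply would look like success.)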
+ bash -c "set -o pipefail; $APPLIANCE_SEED_CMD" +fi + # === Stage 2: hands-on prompt === SSH_KEY="$CACHE_DIR/$NAME-ssh" cat </dev/null || true # y-script-lint:disable=or-true # cleanup best-effort From 4654271a00a958a0a207a939d7257f9804b291e9 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 12 May 2026 08:12:09 +0000 Subject: [PATCH 13/16] fix(scripts/qemu-to-gcp): default to e2-standard-2 (e2-medium OOMs the stack) Observed an appliance build that ran fine for ~2h at 91-93% memory on a 4 GiB e2-medium, then died at 100% CPU / 3807 MiB used: ssh banner exchange timed out, :443 + :6443 went REFUSED while :80 kept LISTEN with the userspace too starved to respond. Classic OOM spiral. The full appliance stack (k3s + containerd + keycloak + envoy gateway + envoy proxy + mysql + kafka) sits within ~300 MiB of the 4 GiB ceiling at idle; any workload spike pushes it over. e2-standard-2 (2 vCPU / 8 GiB) gives the stack the headroom it needs. GCE machine types bundle CPU + memory, so there's no separate memory override -- that's spelled out in both the help text and the default-assignment comment so the next operator reading either spot sees why we don't surface a GCP_MEMORY knob. GCP_MACHINE_TYPE stays as the escape hatch for highmem / larger shapes. --- scripts/appliance-qemu-to-gcp.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 05fa44d..060dcc7 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -72,7 +72,10 @@ Environment: GCP_ZONE GCP zone (default: europe-north2-a) GCP_BUCKET GCS bucket for image tarballs (default: -appliance-images) - GCP_MACHINE_TYPE Compute Engine machine type (default: e2-medium) + GCP_MACHINE_TYPE Compute Engine machine type (default: e2-standard-2 -- + 2 vCPU / 8 GiB; e2-medium's 4 GiB OOMs the full + appliance stack mid-run. The type bundles CPU + memory, + so there's no separate memory knob.) GCP_IMAGE_FAMILY Image family tag (default: y-cluster-appliance) GCP_DATADIR_DISK Persistent disk for /data/yolean (default: appliance-gcp-datadir; preserved on teardown) @@ -147,7 +150,12 @@ fi GCP_REGION="${GCP_REGION:-europe-north2}" GCP_ZONE="${GCP_ZONE:-europe-north2-a}" GCP_BUCKET="${GCP_BUCKET:-${GCP_PROJECT}-appliance-images}" -GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-medium}" +# e2-standard-2: 2 vCPU / 8 GiB. The 4 GiB e2-medium OOMs once the +# appliance is running a non-trivial workload stack; 8 GiB is the +# floor we have validated. GCE machine types bundle CPU + memory so +# a separate memory knob is not meaningful -- override the whole +# type via GCP_MACHINE_TYPE for highmem / larger shapes. 
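+# (Illustrative override, e.g. GCP_MACHINE_TYPE=e2-highmem-4
+#  scripts/appliance-qemu-to-gcp.sh -- the shape named here is only an
+#  example, not a validated recommendation.)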
+GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-standard-2}" GCP_IMAGE_FAMILY="${GCP_IMAGE_FAMILY:-y-cluster-appliance}" GCP_DATADIR_DISK="${GCP_DATADIR_DISK:-appliance-gcp-datadir}" GCP_DATADIR_SIZE="${GCP_DATADIR_SIZE:-10GB}" From 009bea428bd3c610323dc67d3dae042755d63848 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 10:30:31 +0000 Subject: [PATCH 14/16] GCP NEG endpoint re-attach is idempotent on re-runs (VM is recreated each build) --- scripts/appliance-qemu-to-gcp.sh | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 060dcc7..065a47f 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -456,6 +456,18 @@ do_tls_frontend() { gcloud compute network-endpoint-groups create "${NAME}-tls-neg" \ --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ --network-endpoint-type=GCE_VM_IP_PORT --default-port=80 >/dev/null + fi + # Always (re-)attach the VM endpoint, regardless of whether + # the NEG already existed. GCE auto-removes endpoints when + # the instance they reference is deleted, which Stage 9 does + # on every build (delete+recreate the VM for idempotency). + # If the NEG survived from a prior run but the VM was recreated, + # its endpoint reference is gone and the LB has no backend. + # Skip the add when the endpoint is already attached so re-runs + # without VM recreation stay quiet. + if ! gcloud compute network-endpoint-groups list-network-endpoints "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(instance)' 2>/dev/null | grep -Fxq "$VM_NAME"; then gcloud compute network-endpoint-groups update "${NAME}-tls-neg" \ --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ --add-endpoint="instance=$VM_NAME,port=80" >/dev/null @@ -1264,14 +1276,17 @@ Appliance live in GCP. VMware -- same disk state) Connect: - ssh -i $SSH_KEY ystack@$PUBLIC_IP - curl http://$PUBLIC_IP/ - -kubectl from your laptop (apiserver not externally exposed): - ssh -L 6443:127.0.0.1:6443 -N -i $SSH_KEY ystack@$PUBLIC_IP & + # One-time per appliance, fetch the kubeconfig onto the laptop. + # (ssh can't stream a remote file to a local path AND give you + # an interactive shell on the same connection -- stdin/stdout + # is owned by the shell -- so this fetch is its own one-shot + # ssh, separate from the interactive one below.) ssh -i $SSH_KEY ystack@$PUBLIC_IP sudo cat /etc/rancher/k3s/k3s.yaml \\ - > k3s-$VM_NAME.yaml - KUBECONFIG=k3s-$VM_NAME.yaml kubectl get nodes + > ~/.kube/y-appliance-portforwarded + + # Interactive shell + apiserver tunnel; the tunnel stays up + # until you exit the shell. + ssh -i $SSH_KEY -L 6443:127.0.0.1:6443 ystack@$PUBLIC_IP Teardown when done: $0 teardown From ee8c01e777d61cbc51e505d25ea696480167be90 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 12 May 2026 08:24:17 +0000 Subject: [PATCH 15/16] fix(scripts/qemu-to-gcp): drop apostrophes from YHELP block The previous commit added "e2-medium's" and "there's" inside the single-quoted YHELP heredoc. Single quotes in bash can't contain single quotes, so the apostrophes terminated the string mid-block; the resumed unquoted "4 GiB OOMs ..." got parsed as a command, and any consumer that sourced or executed the help block saw "line 76: 4: command not found". Reworded to avoid the apostrophes entirely. bash -n parses the file clean and --help renders the section as intended. 
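For illustration, the failure mode reduces to a line like this (the text
is made up, not the actual YHELP content):

    msg='the e2-medium's 4 GiB is tight; there's no memory knob'
    # The first apostrophe ends the single-quoted string, so this
    # word-splits as msg='the e2-medium' plus s; bash then runs the
    # next word, 4, as a command name ("4: command not found"), and
    # the second and third apostrophes pair up into a new quoted word.
    # This particular line is still valid syntax, so bash -n alone
    # would not flag it -- the breakage only shows on evaluation.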
--- scripts/appliance-qemu-to-gcp.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 065a47f..32248e7 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -73,9 +73,9 @@ Environment: GCP_BUCKET GCS bucket for image tarballs (default: -appliance-images) GCP_MACHINE_TYPE Compute Engine machine type (default: e2-standard-2 -- - 2 vCPU / 8 GiB; e2-medium's 4 GiB OOMs the full + 2 vCPU / 8 GiB; the 4 GiB e2-medium OOMs the full appliance stack mid-run. The type bundles CPU + memory, - so there's no separate memory knob.) + so memory is not a separate knob.) GCP_IMAGE_FAMILY Image family tag (default: y-cluster-appliance) GCP_DATADIR_DISK Persistent disk for /data/yolean (default: appliance-gcp-datadir; preserved on teardown) From 61dd4959b2d7207efb3bab63a29f574c3decdba7 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 12 May 2026 13:59:18 +0000 Subject: [PATCH 16/16] feat(scripts): fail-fast schema checks on GCP_KEY and H_S3_REGION Both files-pointed-at-by-env-var inputs surfaced the same foot-gun: a malformed value passed the existence check but failed deep inside the tool we shelled into, with a less helpful message: - GCP_KEY pointing at a truncated / wrong-format JSON (e.g. a re-exported key that lost its private_key during a copy-paste) only erred at `gcloud auth activate-service-account`, by which point the operator has already proven the file exists. Now `jq -e` checks that the four fields GCP requires for a service-account auth are populated -- type=service_account, project_id, client_email, private_key -- and errors with the missing field names so the operator knows what to fix. - H_S3_REGION accepted any string and only surfaced "could not resolve host" when the upload URL hit a non-existent endpoint hostname. The help text already documents the valid set (fsn1, hel1, nbg1); now the script enforces it at config-load time with a message naming the valid values. Both checks fire BEFORE any cloud-side state change. Adds no new dependency: jq is already required by the broader appliance flow. --- scripts/appliance-publish-hetzner.sh | 11 +++++++++++ scripts/appliance-qemu-to-gcp.sh | 11 +++++++++++ scripts/e2e-appliance-qemu-to-gcp.sh | 8 ++++++++ 3 files changed, 30 insertions(+) diff --git a/scripts/appliance-publish-hetzner.sh b/scripts/appliance-publish-hetzner.sh index 71a85e2..397f274 100755 --- a/scripts/appliance-publish-hetzner.sh +++ b/scripts/appliance-publish-hetzner.sh @@ -108,6 +108,17 @@ fi : "${H_S3_REGION:?H_S3_REGION not set in $ENV_FILE (fsn1, hel1, or nbg1)}" : "${H_S3_BUCKET:?H_S3_BUCKET not set; pass via env or env file}" +# Enum-check H_S3_REGION before we hit the endpoint URL. A typo +# silently lands on an invalid hostname; this catches it at +# config-load time with a message naming the valid set. +case "$H_S3_REGION" in + fsn1|hel1|nbg1) ;; + *) + echo "H_S3_REGION=$H_S3_REGION not recognised; valid: fsn1, hel1, nbg1" >&2 + exit 1 + ;; +esac + BUCKET="$H_S3_BUCKET" REGION="$H_S3_REGION" ENDPOINT="https://${REGION}.your-objectstorage.com" diff --git a/scripts/appliance-qemu-to-gcp.sh b/scripts/appliance-qemu-to-gcp.sh index 32248e7..acdcd52 100755 --- a/scripts/appliance-qemu-to-gcp.sh +++ b/scripts/appliance-qemu-to-gcp.sh @@ -679,6 +679,17 @@ if [[ ! 
-f "$GCP_KEY" ]]; then echo "create it with: scripts/gcp-bootstrap-credentials.sh on a machine with gcloud Owner access" >&2 exit 1 fi +# Fail fast on a malformed key file (truncated, wrong export +# format, expired exports that lost their private_key) BEFORE +# gcloud activate-service-account errors with a less helpful +# message. The four fields below are the minimum GCP requires +# to authenticate as a service account. +if ! jq -e 'all(.type == "service_account"; .) and (.project_id // empty | length > 0) and (.client_email // empty | length > 0) and (.private_key // empty | length > 0)' "$GCP_KEY" >/dev/null 2>&1; then + echo "GCP key at $GCP_KEY is missing required fields" >&2 + echo " expected JSON with: type=service_account, project_id, client_email, private_key" >&2 + echo " regenerate via: scripts/gcp-bootstrap-credentials.sh" >&2 + exit 1 +fi export GOOGLE_APPLICATION_CREDENTIALS="$GCP_KEY" # Acknowledge parallel composite uploads up front. The setting diff --git a/scripts/e2e-appliance-qemu-to-gcp.sh b/scripts/e2e-appliance-qemu-to-gcp.sh index 8f16523..073d933 100755 --- a/scripts/e2e-appliance-qemu-to-gcp.sh +++ b/scripts/e2e-appliance-qemu-to-gcp.sh @@ -173,6 +173,14 @@ if [[ ! -f "$GCP_KEY" ]]; then echo "create it with: scripts/gcp-bootstrap-credentials.sh" >&2 exit 1 fi +# Same schema check as the interactive flow -- mismatch on a +# truncated / wrong-shape JSON should fail fast here rather +# than two gcloud calls in. +if ! jq -e 'all(.type == "service_account"; .) and (.project_id // empty | length > 0) and (.client_email // empty | length > 0) and (.private_key // empty | length > 0)' "$GCP_KEY" >/dev/null 2>&1; then + echo "GCP key at $GCP_KEY is missing required fields" >&2 + echo " expected JSON with: type=service_account, project_id, client_email, private_key" >&2 + exit 1 +fi export GOOGLE_APPLICATION_CREDENTIALS="$GCP_KEY" # Acknowledge parallel composite uploads up front (silences