diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f687ae0 --- /dev/null +++ b/.env.example @@ -0,0 +1,35 @@ +# Copy to .env (gitignored) and fill in your values: +# cp .env.example .env +# +# Sourced from the repo root by the scripts under scripts/. Each +# script bails with `set GCP_PROJECT in .env` (or similar) if a +# required value is unset. + +# === GCP appliance flow (scripts/appliance-qemu-to-gcp.sh) === + +# GCP project where appliance images, VMs, and the persistent +# data disk live. The bundled service account +# (gcp-bootstrap-credentials.sh) must have roles/owner here. +# Example: my-org-appliance +GCP_PROJECT= + +# Path to the GCP service-account JSON. Created via +# scripts/gcp-bootstrap-credentials.sh on a machine with +# gcloud Owner access; copy the JSON to this path on the +# build machine and chmod 600. +# Example: ~/.config/y-cluster/gcp-appliance.key.json +GCP_KEY= + +# === Hetzner Object Storage publish (scripts/appliance-publish-hetzner.sh) === + +# Path to a file containing HCLOUD_TOKEN and Hetzner Object +# Storage credentials. Format documented in the script's header. +# Example: ~/.config/y-cluster/hetzner.env +H_S3_ENV_FILE= + +# === Hetzner Cloud Packer flow (scripts/e2e-appliance-hetzner.sh) === + +# Same file as H_S3_ENV_FILE when HCLOUD_TOKEN + S3 creds +# co-locate; separate ENV_FILE var historically. +# Example: ~/.config/y-cluster/hetzner.env +ENV_FILE= diff --git a/.gitignore b/.gitignore index 3d0a7ca..e5be3bd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /kustomize-traverse /dist/ + +# operator-local defaults sourced by the appliance scripts +.env diff --git a/scripts/appliance-build-hetzner.sh b/scripts/appliance-build-hetzner.sh new file mode 100755 index 0000000..53893b9 --- /dev/null +++ b/scripts/appliance-build-hetzner.sh @@ -0,0 +1,416 @@ +#!/usr/bin/env bash +# Build a y-cluster appliance interactively: stand up a local +# qemu cluster with the same fixtures we'll ship, give the +# operator a chance to poke at it, then on confirm run a +# Packer-built Hetzner snapshot and provision a server from +# it. Shows ssh + curl details for both stages. +# +# Why two clusters: the local one is for hands-on verification +# (kubectl / ssh / poke). The Hetzner one is the actual handoff. +# They're built from the same testdata fixtures, so verifying +# locally proves the fixture set; Packer rebuilds the snapshot +# fresh inside Hetzner. No round-trip artefact transfer between +# the two -- they're independent builds with shared inputs. +# +# Two confirmations: +# 1. "Local cluster looks good -- build Hetzner snapshot?" +# 2. "Snapshot ready -- create server from snapshot?" +# Either prompt aborts non-destructively. Aborting at (1) +# leaves the local cluster up; aborting at (2) leaves the +# Hetzner snapshot in your project for later use. 
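+# Typical invocations, for orientation (illustrative; every knob used
+# here is documented under Environment in the help text below):
+#   ./scripts/appliance-build-hetzner.sh                # interactive: both prompts
+#   ASSUME_YES=1 ./scripts/appliance-build-hetzner.sh   # unattended end-to-end
+#   SERVER_NAME=demo SERVER_LOCATION=fsn1 ./scripts/appliance-build-hetzner.sh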
+ +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-build-hetzner.sh - local verify -> confirm -> Packer snapshot -> confirm -> Hetzner server + +Usage: appliance-build-hetzner.sh + +Environment: + ENV_FILE Hetzner credentials file (set in .env or shell env; required) + HCLOUD_TOKEN Hetzner Cloud API token (sourced from ENV_FILE) + NAME Local cluster name (default: appliance-hetzner-build) + APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override host port for guest 443 (y-cluster default: 443) + APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) + SERVER_NAME Hetzner server name (default: y-cluster-appliance) + SERVER_TYPE Hetzner server type (default: cx23) + SERVER_LOCATION Hetzner location (default: hel1) + SNAPSHOT_NAME Packer snapshot description (default: y-cluster-appliance-) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_LOCAL Set to keep the local cluster after Hetzner deploy (default: tear down) + ASSUME_YES Set to skip BOTH confirmations and proceed end-to-end + +Dependencies: + go, qemu-system-x86_64, kubectl, ssh, ssh-keygen, curl, packer, hcloud +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${ENV_FILE:?set ENV_FILE in .env or shell env}" + +NAME="${NAME:-appliance-hetzner-build}" +SERVER_NAME="${SERVER_NAME:-y-cluster-appliance}" +SERVER_TYPE="${SERVER_TYPE:-cx23}" +SERVER_LOCATION="${SERVER_LOCATION:-hel1}" +SNAPSHOT_NAME="${SNAPSHOT_NAME:-y-cluster-appliance-$(date -u +%Y%m%d-%H%M%S)}" + +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" +PACKER_TEMPLATE="$REPO_ROOT/scripts/e2e-appliance-hetzner.pkr.hcl" + +# Keep CFG_DIR stable + outside CACHE_DIR (the cleanup glob in the +# qemu provisioner would otherwise match this directory and rm -f +# would bail, killing the script under set -e). Same convention as +# scripts/appliance-build-virtualbox.sh. +CFG_DIR="${CFG_DIR:-$HOME/.cache/y-cluster-appliance-build/$NAME}" + +# Stable location for the per-deploy ssh key so the operator can +# ssh into the Hetzner server later. Survives across script runs +# unless they delete the file or run with a fresh SERVER_NAME. +HCLOUD_KEY_DIR="$HOME/.cache/y-cluster-appliance-build/hetzner-keys" +HCLOUD_KEY="$HCLOUD_KEY_DIR/$SERVER_NAME" + +stage() { printf '\n=== %s ===\n' "$*"; } + +confirm() { + local prompt=$1 + if [[ -n "${ASSUME_YES:-}" ]]; then + echo "ASSUME_YES set; proceeding ($prompt)" + return 0 + fi + read -r -p "$prompt [y/N] " answer + case "${answer,,}" in + y|yes) return 0 ;; + *) return 1 ;; + esac +} + +cat <<'WARN' + +================================================================ +DEPRECATION WARNING + +scripts/appliance-build-hetzner.sh is on the way out. + +Hetzner Cloud has no public API for uploading custom disk +images, so this script's "build a Hetzner snapshot" stage is +a fresh build inside Hetzner via Packer -- the local-qemu +verification you do first is fixture-equivalence, NOT the same +disk that ships. That mismatches the appliance contract +(local-built disk = disk that boots elsewhere). 
+ +Replacement plan: + - scripts/appliance-qemu-to-gcp.sh (in progress) takes the + appliance contract path: provision local, export disk, + upload to GCP via `gcloud compute images import`, boot a + VM from that uploaded image. Same disk you verified + locally is the disk GCP runs. + - scripts/e2e-appliance-hetzner.sh is being repurposed once + a pkg/provision/hetzner/ provisioner exists; it will then + cover provision-on-Hetzner -> snapshot -> instantiate as + an end-to-end test of that provisioner shape. + +This script still runs. It still produces a working appliance +on Hetzner. But the artefact you ship is built fresh on +Hetzner, not transferred from your local verification. +================================================================ + +WARN +confirm "Proceed with the Hetzner Packer flow anyway?" \ + || { echo "aborted; no changes made."; exit 0; } + + +for tool in go qemu-system-x86_64 kubectl ssh ssh-keygen curl packer hcloud; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +if [[ ! -f "$ENV_FILE" ]]; then + echo "missing env file: $ENV_FILE (need HCLOUD_TOKEN)" >&2 + exit 1 +fi +# shellcheck disable=SC1090 +source "$ENV_FILE" +[[ -n "${HCLOUD_TOKEN:-}" ]] || { echo "HCLOUD_TOKEN not set in $ENV_FILE" >&2; exit 1; } +export HCLOUD_TOKEN + +# === Build dev binary (linux/amd64 because Packer uploads it) === +stage "building linux/amd64 dev binary -> $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && GOOS=linux GOARCH=amd64 go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === Local config === +mkdir -p "$CFG_DIR" +# YAML emission omits any port the operator didn't override, letting +# y-cluster's Go binary apply its own defaults (sshPort=2222, +# portForwards={6443:6443, 80:80, 443:443}). 
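+# With no APP_*_PORT overrides the emitted file reduces to the fixed
+# fields below (illustrative; mirrors the echo block that follows):
+#   provider: qemu
+#   name: $NAME
+#   context: $NAME
+#   memory: "4096"
+#   cpus: "2"
+#   diskSize: "40G"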
+{ + echo "provider: qemu" + echo "name: $NAME" + echo "context: $NAME" + [ -n "${APP_SSH_PORT:-}" ] && printf 'sshPort: "%s"\n' "$APP_SSH_PORT" + echo 'memory: "4096"' + echo 'cpus: "2"' + echo 'diskSize: "40G"' + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + echo "portForwards:" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" + fi +} > "$CFG_DIR/y-cluster-provision.yaml" + +# === Stage 1: local provision + install + smoketest === +stage "tearing down any leftover $NAME cluster" +"$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error + +stage "provisioning local appliance ($NAME) -- k3s + Envoy Gateway" +"$Y_CLUSTER" provision -c "$CFG_DIR" + +stage "installing echo workload" +"$Y_CLUSTER" echo render \ + | kubectl --context="$NAME" apply --server-side --field-manager=customer-install -f - +kubectl --context="$NAME" -n y-cluster wait \ + --for=condition=Available deployment/echo --timeout=180s + +stage "installing VersityGW StatefulSet via yconverge" +"$Y_CLUSTER" yconverge --context="$NAME" \ + -k "$REPO_ROOT/testdata/appliance-stateful/base" + +stage "smoketest: echo + s3" +probe_local() { + local what=$1 url=$2 attempts=${3:-30} + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o /dev/null -w " $what HTTP %{http_code}\n" "$url"; then + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 5 + done + echo "$what smoketest never succeeded; aborting" >&2 + return 1 +} +probe_local echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" +probe_local s3 "http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" + +cat </dev/null || true # y-script-lint:disable=or-true # cleanup best-effort +fi + +# Pre-render the kustomize bases for Packer (the build VM doesn't +# have y-cluster, so it can't run yconverge; concat both module +# outputs into a single kubectl-applyable file). Same shape as +# scripts/e2e-appliance-hetzner.sh. +STATEFUL_MANIFEST=$(mktemp -t appliance-stateful.XXXXXX.yaml) +{ + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/namespace" + echo '---' + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/base" +} > "$STATEFUL_MANIFEST" + +LOCALSTORAGE_MANIFEST=$(mktemp -t y-cluster-localstorage.XXXXXX.yaml) +"$Y_CLUSTER" localstorage render > "$LOCALSTORAGE_MANIFEST" + +trap 'rm -f "$STATEFUL_MANIFEST" "$LOCALSTORAGE_MANIFEST"' EXIT + +stage "packer init" +packer init "$PACKER_TEMPLATE" + +stage "packer build (creates a temporary $SERVER_TYPE in $SERVER_LOCATION, snapshots, deletes)" +packer build \ + -var "snapshot_name=$SNAPSHOT_NAME" \ + -var "server_type=$SERVER_TYPE" \ + -var "location=$SERVER_LOCATION" \ + -var "y_cluster_binary=$Y_CLUSTER" \ + -var "prepare_script=$REPO_ROOT/pkg/provision/qemu/prepare_inguest.sh" \ + -var "stateful_manifest=$STATEFUL_MANIFEST" \ + -var "localstorage_manifest=$LOCALSTORAGE_MANIFEST" \ + "$PACKER_TEMPLATE" + +# Resolve snapshot id from the description we gave Packer. 
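+# For reference, an equivalent filter with jq (jq is not in this
+# script's dependency list, hence the python3 one-liner below):
+#   hcloud image list --type=snapshot --selector="purpose=y-cluster-appliance" --output=json \
+#     | jq -r --arg d "$SNAPSHOT_NAME" '.[] | select(.description == $d) | .id'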
+stage "resolving snapshot id for $SNAPSHOT_NAME" +SNAPSHOT_ID=$(hcloud image list \ + --type=snapshot \ + --selector="purpose=y-cluster-appliance" \ + --output=json \ + | python3 -c " +import json, sys +images = json.load(sys.stdin) +matches = [i for i in images if i.get('description') == '$SNAPSHOT_NAME'] +if not matches: + sys.exit('no snapshot named $SNAPSHOT_NAME found') +print(matches[0]['id']) +") +echo " snapshot id: $SNAPSHOT_ID" + +cat < +================================================================ + +EOF + +confirm "Create Hetzner server from snapshot $SNAPSHOT_ID?" \ + || { echo "aborted; snapshot $SNAPSHOT_ID preserved for later use."; exit 0; } + +# === Stage 3: create server + verify === +mkdir -p "$HCLOUD_KEY_DIR" +chmod 700 "$HCLOUD_KEY_DIR" +if [[ ! -f "$HCLOUD_KEY" ]]; then + ssh-keygen -t ed25519 -N '' -C "$SERVER_NAME-$$" -f "$HCLOUD_KEY" -q +fi +KEY_NAME="$SERVER_NAME" + +stage "tearing down any leftover server / key from a prior run" +hcloud server delete "$SERVER_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing server is not an error +hcloud ssh-key delete "$KEY_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing key is not an error + +stage "registering ssh public key as $KEY_NAME" +hcloud ssh-key create --name "$KEY_NAME" --public-key-from-file "$HCLOUD_KEY.pub" >/dev/null + +stage "creating $SERVER_NAME from snapshot $SNAPSHOT_ID" +hcloud server create \ + --name "$SERVER_NAME" \ + --type "$SERVER_TYPE" \ + --image "$SNAPSHOT_ID" \ + --location "$SERVER_LOCATION" \ + --ssh-key "$KEY_NAME" \ + >/dev/null +PUBLIC_IP=$(hcloud server ip "$SERVER_NAME") +echo " public ip: $PUBLIC_IP" + +# Wait for sshd, then probe the workload endpoints. +SSH_OPTS="-i $HCLOUD_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh on $PUBLIC_IP:22" +for _ in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS root@"$PUBLIC_IP" 'true' 2>/dev/null; then + break + fi + sleep 5 +done + +# Cold boot from snapshot: cloud-init -> k3s.service first start -> +# envoy gateway controller + data plane -> VersityGW StatefulSet +# rebinds its PV -> klipper-lb binds :80. Generous loop. +probe_remote() { + local what=$1 url=$2 attempts=${3:-60} + local out + out=$(mktemp) + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o "$out" -w " $what HTTP %{http_code}\n" "$url"; then + echo + echo "=== $what response (head) ===" + head -25 "$out" + echo + rm -f "$out" + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 10 + done + rm -f "$out" + return 1 +} + +stage "probing http://$PUBLIC_IP -- echo + s3" +if probe_remote echo "http://$PUBLIC_IP/q/envoy/echo" \ + && probe_remote s3 "http://$PUBLIC_IP/s3/health"; then + cat < k3s-$SERVER_NAME.yaml + KUBECONFIG=k3s-$SERVER_NAME.yaml kubectl get nodes + (k3s's apiserver isn't open to the internet by default; either + add 6443 to the Hetzner firewall, or tunnel via ssh: + ssh -L 6443:127.0.0.1:6443 -N root@$PUBLIC_IP &) + +When you're done: + hcloud server delete $SERVER_NAME + hcloud ssh-key delete $KEY_NAME + hcloud image delete $SNAPSHOT_ID # optional; snapshot is reusable +================================================================ +EOF + exit 0 +fi + +echo >&2 +echo "echo never answered. 
Server $SERVER_NAME left running for diagnosis:" >&2 +# shellcheck disable=SC2086 +ssh $SSH_OPTS root@"$PUBLIC_IP" 'systemctl is-active k3s; kubectl get pods -A 2>&1 | head -30' >&2 \ + || true # y-script-lint:disable=or-true # diagnostic best-effort +echo " ssh: ssh -i $HCLOUD_KEY root@$PUBLIC_IP" >&2 +echo " destroy: hcloud server delete $SERVER_NAME" >&2 +exit 1 diff --git a/scripts/appliance-build-virtualbox.sh b/scripts/appliance-build-virtualbox.sh new file mode 100755 index 0000000..a7ed227 --- /dev/null +++ b/scripts/appliance-build-virtualbox.sh @@ -0,0 +1,250 @@ +#!/usr/bin/env bash +# Build a y-cluster appliance and pause for hands-on testing +# before exporting a VirtualBox-friendly bundle. +# +# Same provision shape as scripts/e2e-appliance-export-import.sh: +# qemu provider, k3s + Envoy Gateway, echo workload, VersityGW +# StatefulSet (covers stateful PV path). Then it stops, prints +# kubectl + ssh access info, and waits for the operator to +# confirm before running prepare-export + export. +# +# Why interactive: the VirtualBox handoff is precious. We want +# the operator to sanity-check the live cluster before we lock +# the disk for export and (optionally) tear it down. Yes lets +# y-cluster prepare-export + export run; "no" leaves the cluster +# up for further poking (and prints the teardown command). +# +# The bundled VMDK uses subformat=monolithicSparse, which +# imports more cleanly under VirtualBox's "Use Existing Virtual +# Hard Disk File" than the streamOptimized default that ships +# for ESXi. The README inside the bundle documents both. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-build-virtualbox.sh - provision -> install -> pause -> export VirtualBox-friendly VMDK + +Usage: appliance-build-virtualbox.sh [bundle-dir] + +Positional: + bundle-dir Where to write the export bundle. Default: + ./dist/appliance-virtualbox/- + +Environment: + NAME Appliance name (default: appliance-virtualbox) + APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override host port for guest 443 (y-cluster default: 443) + APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_CLUSTER Set to keep the cluster alive after export (default: tear it down) + SKIP_PROVISION Set to skip provision + install (resume into the prompt against + an already-running cluster of the same NAME) + ASSUME_YES Set to skip the interactive prompt and proceed to export + +Dependencies: + go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl, virt-sysprep +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +NAME="${NAME:-appliance-virtualbox}" + +REPO_ROOT="$(git rev-parse --show-toplevel)" +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" + +DEFAULT_BUNDLE="$REPO_ROOT/dist/appliance-virtualbox/$NAME-$(date -u +%Y%m%dT%H%M%SZ)" +BUNDLE_DIR="${1:-$DEFAULT_BUNDLE}" + +# CFG_DIR lives OUTSIDE $CACHE_DIR on purpose: the cleanup glob +# below ("$CACHE_DIR/$NAME-"*) would otherwise match a config +# directory whose name starts with $NAME, and rm -f bails on +# directories under set -e. Keep it stable (not mktemp -d) so +# SKIP_PROVISION can resume against an existing cluster. 
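+# e.g. (illustrative): SKIP_PROVISION=1 ./scripts/appliance-build-virtualbox.sh
+# reuses the running $NAME cluster and resumes at the interactive prompt.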
+CFG_DIR="${CFG_DIR:-$HOME/.cache/y-cluster-appliance-build/$NAME}" + +stage() { printf '\n=== %s ===\n' "$*"; } + +for tool in go qemu-system-x86_64 qemu-img kubectl ssh ssh-keygen curl virt-sysprep; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +# virt-sysprep needs to read /boot/vmlinuz-* (libguestfs supermin +# builds an appliance VM with the host kernel). Ubuntu installs +# kernel images 0600 root, so non-root invocations bail with an +# opaque "supermin exited with error status 1". Surface the fix. +if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then + cat >&2 < $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === Config (always written; teardown + prepare-export need it) === +mkdir -p "$CFG_DIR" +# YAML emission omits any port the operator didn't override, letting +# y-cluster's Go binary apply its own defaults (sshPort=2222, +# portForwards={6443:6443, 80:80, 443:443}). +{ + echo "provider: qemu" + echo "name: $NAME" + echo "context: $NAME" + [ -n "${APP_SSH_PORT:-}" ] && printf 'sshPort: "%s"\n' "$APP_SSH_PORT" + echo 'memory: "4096"' + echo 'cpus: "2"' + echo 'diskSize: "40G"' + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + echo "portForwards:" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" + fi +} > "$CFG_DIR/y-cluster-provision.yaml" + +if [[ -z "${SKIP_PROVISION:-}" ]]; then + stage "tearing down any leftover $NAME cluster" + "$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error + rm -f "$CACHE_DIR/$NAME".* "$CACHE_DIR/$NAME-"* + + stage "provisioning $NAME (k3s + Envoy Gateway)" + "$Y_CLUSTER" provision -c "$CFG_DIR" + + stage "installing echo workload" + "$Y_CLUSTER" echo render \ + | kubectl --context="$NAME" apply --server-side --field-manager=customer-install -f - + kubectl --context="$NAME" -n y-cluster wait \ + --for=condition=Available deployment/echo --timeout=180s + + stage "installing VersityGW StatefulSet via yconverge" + "$Y_CLUSTER" yconverge --context="$NAME" \ + -k "$REPO_ROOT/testdata/appliance-stateful/base" + + stage "smoketest: echo + s3" + probe() { + local what=$1 url=$2 attempts=${3:-30} + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o /dev/null -w " $what HTTP %{http_code}\n" "$url"; then + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 5 + done + echo "$what smoketest never succeeded; aborting" >&2 + return 1 + } + probe echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" + probe s3 "http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" +else + stage "SKIP_PROVISION set; resuming against existing $NAME cluster" +fi + +# === Interactive pause for hands-on testing === +SSH_KEY="$CACHE_DIR/$NAME-ssh" + +cat < Import Appliance wizard accepts +# only OVF / OVA, NOT raw VMDK -- so we ship OVA. The OVF +# carries the CPU/RAM/NIC hints; VirtualBox just needs port +# forwards added post-import. 
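+# The customer-side import can also be scripted instead of clicking
+# through the wizard (illustrative; VM name and host ports assumed to
+# match the README instructions printed below):
+#   VBoxManage import "$NAME.ova" --vsys 0 --vmname "$NAME"
+#   VBoxManage modifyvm "$NAME" --natpf1 "ssh,tcp,,2222,,22"
+#   VBoxManage modifyvm "$NAME" --natpf1 "http,tcp,,8080,,80"
+#   VBoxManage modifyvm "$NAME" --natpf1 "https,tcp,,8443,,443"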
+stage "exporting OVA (VirtualBox-importable) -> $BUNDLE_DIR" +mkdir -p "$(dirname "$BUNDLE_DIR")" +"$Y_CLUSTER" export \ + --context="$NAME" \ + --format=ova \ + "$BUNDLE_DIR" + +ls -la "$BUNDLE_DIR/" +echo +echo " bundled .ova members:" +tar tvf "$BUNDLE_DIR/$NAME.ova" | sed 's/^/ /' + +cat < Import Appliance -> select $BUNDLE_DIR/$NAME.ova + 2. Confirm CPU / RAM / disk on the wizard (defaults come + from the OVF: $(awk '/cpus/{print $2}' "$CFG_DIR/y-cluster-provision.yaml") vCPU, $(awk '/memory/{print $2}' "$CFG_DIR/y-cluster-provision.yaml") MiB RAM) + 3. After import: Network -> Adapter 1 -> Advanced -> Port + Forwarding, add: + ssh TCP host 2222 -> guest 22 + http TCP host 8080 -> guest 80 + https TCP host 8443 -> guest 443 + 4. Start. SSH key + access details in $BUNDLE_DIR/README.md +================================================================ +EOF + +if [[ -z "${KEEP_CLUSTER:-}" ]]; then + stage "tearing down build-side cluster (set KEEP_CLUSTER=1 to keep it)" + "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null || true # y-script-lint:disable=or-true # cleanup best-effort +fi diff --git a/scripts/appliance-publish-hetzner.sh b/scripts/appliance-publish-hetzner.sh new file mode 100755 index 0000000..397f274 --- /dev/null +++ b/scripts/appliance-publish-hetzner.sh @@ -0,0 +1,323 @@ +#!/usr/bin/env bash +# Idempotently ensure a Hetzner Object Storage bucket exists, +# configured to allow public GET on individual objects but NOT +# bucket listing, then upload a single file and print its public +# URL. +# +# Use case: the operator runs scripts/appliance-build-virtualbox.sh +# to produce a VMDK bundle, then this script to publish the +# bundle (or a tarball of it) at a URL their test host can curl +# while staying anonymous. +# +# Hetzner Object Storage is S3-compatible; we shell out to the +# AWS CLI pointed at https://.your-objectstorage.com. +# If `aws` is not installed locally we run the official image +# via docker, which is universally available on dev machines. +# +# Credentials live in $H_S3_ENV_FILE (set in .env or shell env; +# typically the same file that holds HCLOUD_TOKEN). The file +# should set: +# H_S3_ACCESS_KEY= Object +# Storage -> Credentials> +# H_S3_SECRET_KEY=... +# H_S3_REGION=fsn1 # or hel1 / nbg1 +# H_S3_BUCKET=... # default bucket (script arg overrides) +# +# These are SEPARATE from HCLOUD_TOKEN: Object Storage is +# managed under the same project but the API uses dedicated +# S3 access/secret keys, not the Cloud API token. We co-locate +# them in the same env file because they share a project, not +# because they share an auth scheme. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-publish-hetzner.sh - upload a file to a Hetzner Object Storage bucket with public-read on objects (no listing) + +Usage: appliance-publish-hetzner.sh [object-key] + +Positional: + file Local path to upload + object-key Key to write under in the bucket (default: basename of file) + +Environment: + H_S3_ENV_FILE Path to env file with H_S3_* vars (set in .env or shell env; required) + H_S3_BUCKET Bucket name; overrides the env file. Required if not in env file. + H_S3_REGION Region; overrides the env file (fsn1, hel1, or nbg1). + AWS_CLI How to invoke aws. Default: local `aws` if on PATH, + else `docker run --rm -i public.ecr.aws/aws-cli/aws-cli`. 
+ +Examples: + # publish a fresh appliance bundle + ./scripts/appliance-publish-hetzner.sh \ + dist/appliance-virtualbox/appliance-virtualbox-*/appliance-virtualbox.vmdk + + # publish under a custom key + ./scripts/appliance-publish-hetzner.sh appliance.tar.gz releases/2026-05-01/appliance.tar.gz + +Dependencies: + curl, and one of: locally-installed `aws` (preferred) OR `docker` + (used to invoke public.ecr.aws/aws-cli/aws-cli when aws is missing) +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; + "") echo "$YHELP" >&2; exit 2 ;; +esac + +INPUT="$1" +KEY_OVERRIDE="${2:-}" + +stage() { printf '\n=== %s ===\n' "$*"; } + +if [[ ! -e "$INPUT" ]]; then + echo "path not found: $INPUT" >&2 + exit 1 +fi + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${H_S3_ENV_FILE:?set H_S3_ENV_FILE in .env or shell env}" +ENV_FILE="$H_S3_ENV_FILE" +if [[ -f "$ENV_FILE" ]]; then + # shellcheck disable=SC1090 + set -a; . "$ENV_FILE"; set +a +else + echo "credentials file not found: $ENV_FILE" >&2 + cat >&2 </object-storage/credentials +EOF + exit 1 +fi + +: "${H_S3_ACCESS_KEY:?H_S3_ACCESS_KEY not set in $ENV_FILE}" +: "${H_S3_SECRET_KEY:?H_S3_SECRET_KEY not set in $ENV_FILE}" +: "${H_S3_REGION:?H_S3_REGION not set in $ENV_FILE (fsn1, hel1, or nbg1)}" +: "${H_S3_BUCKET:?H_S3_BUCKET not set; pass via env or env file}" + +# Enum-check H_S3_REGION before we hit the endpoint URL. A typo +# silently lands on an invalid hostname; this catches it at +# config-load time with a message naming the valid set. +case "$H_S3_REGION" in + fsn1|hel1|nbg1) ;; + *) + echo "H_S3_REGION=$H_S3_REGION not recognised; valid: fsn1, hel1, nbg1" >&2 + exit 1 + ;; +esac + +BUCKET="$H_S3_BUCKET" +REGION="$H_S3_REGION" +ENDPOINT="https://${REGION}.your-objectstorage.com" + +# === Decide what to upload === +# Two modes: +# bundle - INPUT is a directory that looks like a y-cluster +# bundle (or a file inside one, identified by a +# sibling README.md). We tar `-C parent dirname` so +# the tarball extracts to a sibling directory in the +# customer's CWD: `tar xzf .tgz` produces +# `.//{README.md, *.vmdk, *-ssh, *-ssh.pub}`. +# single - INPUT is a regular file with no bundle context. +# Upload as-is. Key defaults to its basename. +# Bundle mode is preferred whenever a README.md sits next to +# the disk file, so the operator can pass either the directory +# or the .vmdk and get the same bundle-tarball result. +SOURCE_FILE="" +KEY="" +BUNDLE_DIR="" + +if [[ -d "$INPUT" ]]; then + BUNDLE_DIR=$(realpath "$INPUT") +elif [[ -f "$INPUT" && -f "$(dirname "$INPUT")/README.md" ]]; then + BUNDLE_DIR=$(realpath "$(dirname "$INPUT")") +fi + +if [[ -n "$BUNDLE_DIR" ]]; then + bundle_name=$(basename "$BUNDLE_DIR") + bundle_parent=$(dirname "$BUNDLE_DIR") + # Write the tarball next to the bundle dir, NOT under /tmp. + # /tmp is tmpfs on most distros (~16 GB) and a 1.5 GiB + # appliance tarball easily exhausts it; bundle_parent is on + # the operator's chosen output volume where space matches + # the bundle size. 
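+  # (Illustrative sanity check before packing: `df -h "$bundle_parent"`
+  # confirms the chosen output volume actually has the headroom.)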
+ TGZ="$bundle_parent/.${bundle_name}.$$.tgz" + trap 'rm -f "$TGZ"' EXIT + stage "packing bundle $BUNDLE_DIR -> $TGZ" + tar -czf "$TGZ" -C "$bundle_parent" "$bundle_name" + SOURCE_FILE="$TGZ" + KEY="${KEY_OVERRIDE:-${bundle_name}.tgz}" +else + SOURCE_FILE="$INPUT" + KEY="${KEY_OVERRIDE:-$(basename "$INPUT")}" +fi + +PUBLIC_URL="https://${BUCKET}.${REGION}.your-objectstorage.com/${KEY}" + +# === Pick an AWS CLI invocation === +# Prefer a local `aws` to avoid pulling a 200MB image on every +# run; fall back to docker so a fresh dev box doesn't have to +# install awscli first. +if [[ -n "${AWS_CLI:-}" ]]; then + : # operator override; trust it verbatim +elif command -v aws >/dev/null; then + AWS_CLI="aws" +elif command -v docker >/dev/null; then + # Mount /tmp because mktemp puts the policy + tarball there; + # mount $HOME so absolute paths under $HOME (typical y-cluster + # cache locations) resolve inside the container; -w $PWD + + # -v $PWD:$PWD lets relative paths the operator typed work. + AWS_CLI="docker run --rm -i \ + -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION \ + -v $HOME:$HOME -v $PWD:$PWD -v /tmp:/tmp -w $PWD \ + public.ecr.aws/aws-cli/aws-cli" +else + echo "neither 'aws' nor 'docker' found; install one or set AWS_CLI" >&2 + exit 1 +fi + +export AWS_ACCESS_KEY_ID="$H_S3_ACCESS_KEY" +export AWS_SECRET_ACCESS_KEY="$H_S3_SECRET_KEY" +export AWS_DEFAULT_REGION="$REGION" + +aws_s3api() { + # shellcheck disable=SC2086 + $AWS_CLI s3api --endpoint-url "$ENDPOINT" "$@" +} +aws_s3() { + # shellcheck disable=SC2086 + $AWS_CLI s3 --endpoint-url "$ENDPOINT" "$@" +} + +# === Ensure bucket exists === +# head-bucket exits 0 if the bucket exists and we have access, +# nonzero with stderr "Not Found" / "Forbidden" otherwise. We +# only auto-create on Not Found; Forbidden means a name clash +# in another tenant and the operator should pick a different +# bucket name. +stage "checking bucket s3://$BUCKET (endpoint: $ENDPOINT)" +head_err=$(mktemp) +trap 'rm -f "$head_err"' EXIT +if aws_s3api head-bucket --bucket "$BUCKET" 2>"$head_err"; then + echo " bucket exists" +else + if grep -qiE '404|Not Found|NoSuchBucket' "$head_err"; then + stage "creating bucket s3://$BUCKET" + # Hetzner rejects LocationConstraint=us-east-1 (the + # AWS-CLI default for create-bucket without + # --create-bucket-configuration). Hetzner-region values + # work as the LocationConstraint. + aws_s3api create-bucket \ + --bucket "$BUCKET" \ + --create-bucket-configuration "LocationConstraint=$REGION" + else + echo "head-bucket failed and not a 404:" >&2 + cat "$head_err" >&2 + exit 1 + fi +fi + +# === Apply public-read-on-objects, no-listing policy === +# This is the "anonymous can curl any individual object whose +# key they already know, but cannot enumerate the bucket" +# pattern. We allow only s3:GetObject on the +# arn:aws:s3:::BUCKET/* resource; ListBucket on the bucket +# itself is omitted, so anonymous LIST is denied. 
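+# A policy of this shape implements that pattern (sketch; the Sid is
+# illustrative):
+#   {
+#     "Version": "2012-10-17",
+#     "Statement": [{
+#       "Sid": "PublicReadObjects",
+#       "Effect": "Allow",
+#       "Principal": "*",
+#       "Action": ["s3:GetObject"],
+#       "Resource": ["arn:aws:s3:::$BUCKET/*"]
+#     }]
+#   }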
+stage "applying public-read-objects policy" +policy_file=$(mktemp) +trap 'rm -f "$head_err" "$policy_file"' EXIT +cat > "$policy_file" </dev/null || true # y-script-lint:disable=or-true # not all S3-compat backends implement put-public-access-block; policy alone is sufficient on Hetzner + +# === Upload === +stage "uploading $SOURCE_FILE -> s3://$BUCKET/$KEY" +size=$(stat -c '%s' "$SOURCE_FILE") +echo " size: $size bytes ($(numfmt --to=iec-i --suffix=B "$size" 2>/dev/null || echo "$size B"))" + +# `aws s3 cp` handles multipart for >8MB by default and prints +# a progress bar to stderr; preferred over `s3api put-object` +# for arbitrary-sized files (qcow2 / vmdk are easily >5GB). +aws_s3 cp "$SOURCE_FILE" "s3://$BUCKET/$KEY" + +# === Verify the object is anonymously reachable === +# Use a fresh curl with no creds to confirm the policy actually +# took effect; surfaces config drift (e.g. another script +# overwriting the bucket policy) at publish time, not at +# customer-download time. +stage "verifying anonymous GET" +http_code=$(curl -sI -o /dev/null -w '%{http_code}' "$PUBLIC_URL") +if [[ "$http_code" != "200" ]]; then + echo "anonymous GET returned HTTP $http_code (expected 200)" >&2 + echo "URL: $PUBLIC_URL" >&2 + exit 1 +fi +echo " anonymous GET HTTP 200" + +# === Verify the bucket is NOT anonymously listable === +list_code=$(curl -sI -o /dev/null -w '%{http_code}' "https://${BUCKET}.${REGION}.your-objectstorage.com/") +case "$list_code" in + 403) echo " anonymous LIST denied (HTTP 403): correct" ;; + 200) echo "WARNING: anonymous LIST returned HTTP 200; the bucket is enumerable. Check the policy." >&2 ;; + *) echo " anonymous LIST returned HTTP $list_code" ;; +esac + +cat <.tar.gz containing a single disk.raw, the +# shape Compute Engine custom images expect. +# 5. PROMPT 2: confirm before any GCP-side write happens. +# 6. Upload tarball to GCS (creates bucket on first run). +# 7. gcloud compute images create from the GCS object +# (direct, no Cloud Build). +# 8. gcloud compute firewall-rules create (idempotent) for +# tcp:80 + tcp:443 on tagged instances. +# 9. gcloud compute instances create from the new image, +# tagged for the firewall rule. +# 10. Wait for ssh + probe HTTP. Print connection details. +# +# Aborting at PROMPT 1 leaves the local cluster running. +# Aborting at PROMPT 2 leaves the local bundle written but +# nothing in GCP. +# +# Every gcloud invocation passes --project=$GCP_PROJECT +# explicitly. Auth is the service-account JSON pointed at by +# $GOOGLE_APPLICATION_CREDENTIALS (created by +# scripts/gcp-bootstrap-credentials.sh). 
+ +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='appliance-qemu-to-gcp.sh - local provision -> hands-on -> export -> ship to GCP + +Usage: + appliance-qemu-to-gcp.sh [--reuse-disk=true|false] build + ship to GCP + appliance-qemu-to-gcp.sh teardown [--keep-disk=true|false] + delete VM + image + GCS object; + persistent disk preserved by + default (state-preservation + is the appliance design goal) + +Build flow disk handling: + --reuse-disk=true reuse existing /data/yolean disk (preserves customer state + across redeploys; the build image seed is no-op against an + already-seeded marker -- this is the production upgrade path) + --reuse-disk=false delete + recreate the disk (fresh disk lets the build image + seed extract; use this for QA / end-to-end seed validation) + (no flag, TTY) interactive prompt with default Y (reuse) + (no flag, no TTY) error: explicit choice required for non-interactive runs + +Teardown reads GCP_PROJECT / GCP_ZONE / GCP_BUCKET / VM_NAME / +GCP_DATADIR_DISK / NAME from the same env vars as the build +flow. Custom images and GCS objects are deleted by NAME prefix +(so different NAMEs in the same project do not clobber each +other). The persistent data disk, the bucket itself, and the +firewall rule are preserved unless --keep-disk=false is set +(legacy alias: --delete-data-disk). Local cluster cleanup (if +KEEP_LOCAL was set) is separate: y-cluster teardown -c \$CFG_DIR. + +Environment: + GCP_PROJECT GCP project (set in .env or shell env; required) + GCP_REGION GCP region (default: europe-north2 -- Stockholm) + GCP_ZONE GCP zone (default: europe-north2-a) + GCP_BUCKET GCS bucket for image tarballs + (default: -appliance-images) + GCP_MACHINE_TYPE Compute Engine machine type (default: e2-standard-2 -- + 2 vCPU / 8 GiB; the 4 GiB e2-medium OOMs the full + appliance stack mid-run. The type bundles CPU + memory, + so memory is not a separate knob.) + GCP_IMAGE_FAMILY Image family tag (default: y-cluster-appliance) + GCP_DATADIR_DISK Persistent disk for /data/yolean + (default: appliance-gcp-datadir; preserved on teardown) + GCP_DATADIR_SIZE Persistent disk size (default: 10GB; only used on create) + GCP_KEY Service account JSON (set in .env or shell env; required) + NAME Local cluster name (default: appliance-gcp-build). + Used as the prefix for the deliverable directory. + KUBECTX kubectl context name (default: local). Script + bails if a context with this name already + exists in your kubeconfig -- set KUBECTX to + something else, or delete the existing one. + IMAGE_NAME Custom image name in GCE (default: -) + VM_NAME Compute Engine VM name (default: $NAME) + APP_HTTP_PORT Override host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override host port for guest 443 (y-cluster default: 443) + APP_API_PORT Override host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override host port for guest 22 (y-cluster default: 2222) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_LOCAL Set to keep the local cluster after upload (default: tear down) + KEEP_BUNDLE Set to keep the local export bundle (default: keep -- bundle path printed) + ASSUME_YES Skip BOTH confirmations and proceed end-to-end. + Also suppresses the optional TLS-LB prompt; set + TLS_DOMAINS alongside to opt in unattended. + APPLIANCE_SEED_CMD Optional shell cmd to run after echo install, + before PROMPT 1. 
Receives the + Y_CLUSTER_CURRENT_* env surface (build-side + ports, kubeconfig context, ssh-key path). + Typical: cd into a customer repo and apply + kustomize bases that populate /data/yolean. + Non-zero exit aborts; local cluster left up. + APPLIANCE_VERIFY_CMD Optional shell cmd to run after the GCP + deploy + optional TLS LB, before final + summary. Receives the same Y_CLUSTER_CURRENT_* + surface plus REMOTE_VM_IP / REMOTE_LB_IP / + REMOTE_DOMAINS / REMOTE_SCHEME so a remote + probe can curl --resolve through the + deployed VM. Non-zero exit aborts; VM and + LB stay up for inspection. + TLS_DOMAINS Comma-separated FQDNs for an optional regional + External HTTPS LoadBalancer with a self-signed + cert (e.g., appliance.example.com,admin.appliance.example.com). + Empty: skip the LB step. The HTTPRoutes must + already match these hostnames. + Special value "auto": derive the FQDN list from + `y-cluster gateway hostnames --csv` against the + just-provisioned cluster -- reconciled + HTTPRoute / GRPCRoute hostnames become the LB + cert SAN list, so the two can never drift. + Aborts with an error when "auto" is set but the + cluster has no non-wildcard hostnames yet. + +Dependencies: + go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl, + virt-sysprep, gcloud +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${GCP_PROJECT:?set GCP_PROJECT in .env or shell env}" +: "${GCP_KEY:?set GCP_KEY in .env or shell env}" + +GCP_REGION="${GCP_REGION:-europe-north2}" +GCP_ZONE="${GCP_ZONE:-europe-north2-a}" +GCP_BUCKET="${GCP_BUCKET:-${GCP_PROJECT}-appliance-images}" +# e2-standard-2: 2 vCPU / 8 GiB. The 4 GiB e2-medium OOMs once the +# appliance is running a non-trivial workload stack; 8 GiB is the +# floor we have validated. GCE machine types bundle CPU + memory so +# a separate memory knob is not meaningful -- override the whole +# type via GCP_MACHINE_TYPE for highmem / larger shapes. +GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-standard-2}" +GCP_IMAGE_FAMILY="${GCP_IMAGE_FAMILY:-y-cluster-appliance}" +GCP_DATADIR_DISK="${GCP_DATADIR_DISK:-appliance-gcp-datadir}" +GCP_DATADIR_SIZE="${GCP_DATADIR_SIZE:-10GB}" + +NAME="${NAME:-appliance-gcp-build}" +KUBECTX="${KUBECTX:-local}" +IMAGE_NAME="${IMAGE_NAME:-${NAME}-$(date -u +%Y%m%d-%H%M%S)}" +VM_NAME="${VM_NAME:-$NAME}" + +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" +CFG_DIR="${CFG_DIR:-$HOME/.cache/y-cluster-appliance-build/$NAME}" +# Top-level deliverable dir. Holds two per-format subdirs -- +# `gcp-tar/` (uploaded to Compute Engine here) and `ova/` +# (handed to a customer for VirtualBox / VMware Import +# Appliance). Both subdirs are byte-equivalent disk states; +# the only differences are the on-the-wire format and the +# README boot instructions. 
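+# Roughly (file names illustrative; the two subdir names are fixed):
+#   $BUNDLE_DIR/
+#     gcp-tar/   # a .tar.gz wrapping a single disk.raw (Compute Engine image shape)
+#     ova/       # an .ova for VirtualBox / VMware Import Appliance
+#   (each with its own README boot instructions)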
+BUNDLE_DIR="${BUNDLE_DIR:-$REPO_ROOT/dist/appliance/$NAME-$(date -u +%Y%m%dT%H%M%SZ)}" + +stage() { printf '\n=== %s ===\n' "$*"; } +confirm() { + local prompt=$1 + if [[ -n "${ASSUME_YES:-}" ]]; then + echo "ASSUME_YES set; proceeding ($prompt)" + return 0 + fi + read -r -p "$prompt [y/N] " answer + case "${answer,,}" in + y|yes) return 0 ;; + *) return 1 ;; + esac +} + +# prompt_yes_default is for irreversible decisions where the +# default-on-Enter is YES (state preservation, design-goal aligned) +# but the operator must EXPLICITLY pre-answer for non-interactive +# runs. Distinct from confirm() in two ways: +# - default is Y, not N +# - ASSUME_YES is NOT consulted; non-TTY callers without an +# explicit flag get a clear error instead of a silent default. +# Returns 0 for yes, 1 for no, exits 2 on no-TTY-no-flag. +prompt_yes_default() { + local prompt=$1 missing_flag_hint=$2 + if [[ ! -t 0 ]]; then + echo "non-interactive shell: $missing_flag_hint" >&2 + exit 2 + fi + read -r -p "$prompt [Y/n] " answer + case "${answer,,}" in + n|no) return 1 ;; + *) return 0 ;; + esac +} + +# current_env exports the Y_CLUSTER_CURRENT_* surface a hook +# cmd reads via printenv. Call right before invoking the cmd +# so any vars computed since the last invocation (PUBLIC_IP, +# BUNDLE_DIR, TLS-LB IP) are picked up. Vars not yet known at +# the call site are exported as empty strings (not unset) so +# a verify script can read them unconditionally. +current_env() { + export Y_CLUSTER_CURRENT_NAME="$NAME" + export Y_CLUSTER_CURRENT_KUBECTX="$KUBECTX" + export Y_CLUSTER_CURRENT_LOCAL_HTTP_PORT="${APP_HTTP_PORT:-80}" + export Y_CLUSTER_CURRENT_LOCAL_HTTPS_PORT="${APP_HTTPS_PORT:-443}" + export Y_CLUSTER_CURRENT_LOCAL_API_PORT="${APP_API_PORT:-6443}" + export Y_CLUSTER_CURRENT_LOCAL_SSH_PORT="${APP_SSH_PORT:-2222}" + export Y_CLUSTER_CURRENT_LOCAL_SSH_KEY="${CACHE_DIR:-}/${NAME}-ssh" + export Y_CLUSTER_CURRENT_BUNDLE_DIR="${BUNDLE_DIR:-}" + export Y_CLUSTER_CURRENT_REMOTE_VM_NAME="${VM_NAME:-}" + export Y_CLUSTER_CURRENT_REMOTE_VM_IP="${PUBLIC_IP:-}" + export Y_CLUSTER_CURRENT_REMOTE_DOMAINS="${TLS_DOMAINS:-}" + if [[ -n "${TLS_DOMAINS:-}" ]]; then + # do_tls_frontend keeps lb_ip in local scope; re-query + # gcloud here so the verify hook can read it. + Y_CLUSTER_CURRENT_REMOTE_LB_IP=$(gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(address)' 2>/dev/null \ + || true) # y-script-lint:disable=or-true # absent IP -> empty var, hook decides + export Y_CLUSTER_CURRENT_REMOTE_LB_IP + export Y_CLUSTER_CURRENT_REMOTE_SCHEME=https + else + export Y_CLUSTER_CURRENT_REMOTE_LB_IP="" + export Y_CLUSTER_CURRENT_REMOTE_SCHEME=http + fi + export Y_CLUSTER_CURRENT_GCP_PROJECT="$GCP_PROJECT" + export Y_CLUSTER_CURRENT_GCP_ZONE="$GCP_ZONE" + export Y_CLUSTER_CURRENT_GCP_REGION="$GCP_REGION" +} + +# do_teardown deletes GCP resources owned by this script's +# NAME prefix in the configured project + zone. Reads the +# same env vars as the build flow so a teardown after a +# customised build (e.g., NAME=customer-foo) cleans up +# exactly that customer's resources without touching other +# NAMEs that share the same project. +do_teardown() { + # delete_data_disk: 0=keep (default; design-goal state preservation), + # 1=delete. --keep-disk=true|false is the explicit form; + # --delete-data-disk is the legacy alias mapped to --keep-disk=false. 
+ local delete_data_disk=0 + while [[ $# -gt 0 ]]; do + case "$1" in + --keep-disk=true) delete_data_disk=0 ;; + --keep-disk=false) delete_data_disk=1 ;; + --delete-data-disk) + echo " note: --delete-data-disk is deprecated; prefer --keep-disk=false" + delete_data_disk=1 + ;; + *) echo "unknown teardown flag: $1" >&2; exit 2 ;; + esac + shift + done + + stage "inventory in $GCP_PROJECT / $GCP_ZONE" + local vm images objects disk + vm=$(gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(name)' 2>/dev/null) \ + || true # y-script-lint:disable=or-true # missing VM is not an error + images=$(gcloud compute images list \ + --project="$GCP_PROJECT" \ + --no-standard-images \ + --filter="name~^${NAME}-" \ + --format='value(name)' 2>/dev/null) \ + || true # y-script-lint:disable=or-true # empty list is not an error + objects=$(gcloud storage ls "gs://$GCP_BUCKET/${NAME}-*.tar.gz" \ + --project="$GCP_PROJECT" 2>/dev/null) \ + || true # y-script-lint:disable=or-true # missing bucket / no objects is not an error + disk=$(gcloud compute disks describe "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(name)' 2>/dev/null) \ + || true # y-script-lint:disable=or-true # missing disk is not an error + + echo + echo "Will DELETE:" + [[ -n "$vm" ]] && echo " VM: $VM_NAME ($GCP_ZONE)" + if [[ -n "$images" ]]; then + echo "$images" | sed 's/^/ Image: /' + fi + if [[ -n "$objects" ]]; then + echo "$objects" | sed 's|^| GCS object: |' + fi + if [[ $delete_data_disk -eq 1 && -n "$disk" ]]; then + echo " Data disk: $GCP_DATADIR_DISK (PERSISTENT DATA WILL BE LOST)" + fi + # If a TLS LB stack exists, do_tls_teardown will pick it up. + # We don't enumerate every resource here -- the function logs + # `deleting TLS LB stack ...` when it fires. + if gcloud compute forwarding-rules describe "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null | grep -q .; then + echo " TLS LB stack: ${NAME}-tls-* (forwarding rule + 8 dependents)" + fi + echo + echo "Will PRESERVE:" + if [[ $delete_data_disk -eq 0 && -n "$disk" ]]; then + echo " Data disk: $GCP_DATADIR_DISK (--keep-disk=false to also remove)" + fi + echo " GCS bucket: gs://$GCP_BUCKET (objects matching $NAME-* deleted above)" + echo " Firewall rule: y-cluster-appliance-public (tag-based, shared)" + echo + + if [[ -z "$vm" && -z "$images" && -z "$objects" ]] \ + && { [[ $delete_data_disk -eq 0 ]] || [[ -z "$disk" ]]; }; then + echo "Nothing to delete." + exit 0 + fi + + confirm "Proceed with teardown?" 
\ + || { echo "aborted; nothing deleted."; exit 0; } + + if [[ -n "$vm" ]]; then + stage "deleting VM $VM_NAME" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + if [[ -n "$images" ]]; then + stage "deleting custom images ($(echo "$images" | wc -l))" + # shellcheck disable=SC2086 + echo "$images" | xargs -r -I{} \ + gcloud compute images delete {} --project="$GCP_PROJECT" --quiet + fi + if [[ -n "$objects" ]]; then + stage "deleting GCS objects ($(echo "$objects" | wc -l))" + # shellcheck disable=SC2086 + echo "$objects" | xargs -r \ + gcloud storage rm --project="$GCP_PROJECT" + fi + if [[ $delete_data_disk -eq 1 && -n "$disk" ]]; then + stage "deleting persistent data disk $GCP_DATADIR_DISK" + gcloud compute disks delete "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + + do_tls_teardown + stage "teardown complete" + + # Surface the preservation contract at the moment the operator + # is about to step away. Previously this only appeared in the + # build-flow success block, where it was less actionable. + if [[ $delete_data_disk -eq 0 && -n "$disk" ]]; then + cat </dev/null + chmod 600 "$cert_dir/privkey.pem" + + # Proxy-only subnet: required by regional EXTERNAL_MANAGED LBs, + # one ACTIVE per region+VPC. Reuse if any exists; otherwise + # create a per-build one we can clean up on teardown. + stage "ensuring proxy-only subnet in $GCP_REGION" + if gcloud compute networks subnets list \ + --project="$GCP_PROJECT" \ + --filter "region:$GCP_REGION AND purpose=REGIONAL_MANAGED_PROXY AND role=ACTIVE" \ + --format='value(name)' 2>/dev/null | grep -q .; then + echo " reusing existing proxy-only subnet" + else + gcloud compute networks subnets create "${NAME}-tls-proxy-subnet" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --network=default --range=192.168.42.0/24 \ + --purpose=REGIONAL_MANAGED_PROXY --role=ACTIVE >/dev/null + fi + + stage "reserving regional external IP ${NAME}-tls-ip" + if ! gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute addresses create "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --network-tier=STANDARD >/dev/null + fi + local lb_ip + lb_ip=$(gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(address)') + + stage "uploading SSL cert ${NAME}-tls-cert" + if ! gcloud compute ssl-certificates describe "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute ssl-certificates create "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --certificate="$cert_dir/fullchain.pem" \ + --private-key="$cert_dir/privkey.pem" >/dev/null + fi + + stage "creating health check ${NAME}-tls-hc" + if ! gcloud compute health-checks describe "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute health-checks create http "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --port=80 --request-path=/q/envoy/echo \ + --check-interval=10s --timeout=5s >/dev/null + fi + + stage "creating network endpoint group ${NAME}-tls-neg" + if ! 
gcloud compute network-endpoint-groups describe "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + gcloud compute network-endpoint-groups create "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --network-endpoint-type=GCE_VM_IP_PORT --default-port=80 >/dev/null + fi + # Always (re-)attach the VM endpoint, regardless of whether + # the NEG already existed. GCE auto-removes endpoints when + # the instance they reference is deleted, which Stage 9 does + # on every build (delete+recreate the VM for idempotency). + # If the NEG survived from a prior run but the VM was recreated, + # its endpoint reference is gone and the LB has no backend. + # Skip the add when the endpoint is already attached so re-runs + # without VM recreation stay quiet. + if ! gcloud compute network-endpoint-groups list-network-endpoints "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(instance)' 2>/dev/null | grep -Fxq "$VM_NAME"; then + gcloud compute network-endpoint-groups update "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --add-endpoint="instance=$VM_NAME,port=80" >/dev/null + fi + + stage "creating backend service ${NAME}-tls-backend" + if ! gcloud compute backend-services describe "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute backend-services create "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --load-balancing-scheme=EXTERNAL_MANAGED --protocol=HTTP \ + --health-checks="${NAME}-tls-hc" \ + --health-checks-region="$GCP_REGION" >/dev/null + gcloud compute backend-services add-backend "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --network-endpoint-group="${NAME}-tls-neg" \ + --network-endpoint-group-zone="$GCP_ZONE" \ + --balancing-mode=RATE --max-rate-per-endpoint=100 >/dev/null + fi + + stage "creating URL map ${NAME}-tls-urlmap" + if ! gcloud compute url-maps describe "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute url-maps create "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --default-service="projects/$GCP_PROJECT/regions/$GCP_REGION/backendServices/${NAME}-tls-backend" >/dev/null + fi + + stage "creating target HTTPS proxy ${NAME}-tls-proxy" + if ! gcloud compute target-https-proxies describe "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute target-https-proxies create "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --url-map="${NAME}-tls-urlmap" \ + --ssl-certificates="${NAME}-tls-cert" >/dev/null + fi + + stage "creating forwarding rule ${NAME}-tls-fr (:443)" + if ! gcloud compute forwarding-rules describe "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute forwarding-rules create "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --load-balancing-scheme=EXTERNAL_MANAGED --network-tier=STANDARD \ + --network=default --address="${NAME}-tls-ip" \ + --target-https-proxy="${NAME}-tls-proxy" \ + --target-https-proxy-region="$GCP_REGION" --ports=443 >/dev/null + fi + + # === HTTP -> HTTPS redirect chain === + # GCP regional EXTERNAL_MANAGED URL maps can do a default redirect + # but `gcloud compute url-maps create` has no flag for it -- we + # have to import a YAML body. 
A URL map can have either + # `defaultService` (forward) or `defaultUrlRedirect` (redirect), + # not both, hence the second URL map + second target proxy + second + # forwarding rule sharing the same reserved IP. + stage "creating redirect URL map ${NAME}-tls-redirect (HTTP -> HTTPS)" + if ! gcloud compute url-maps describe "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute url-maps import "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --source=- --quiet >/dev/null </dev/null 2>&1; then + gcloud compute target-http-proxies create "${NAME}-tls-http-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --url-map="${NAME}-tls-redirect" \ + --url-map-region="$GCP_REGION" >/dev/null + fi + + stage "creating forwarding rule ${NAME}-tls-fr-http (:80 -> redirect)" + if ! gcloud compute forwarding-rules describe "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + gcloud compute forwarding-rules create "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --load-balancing-scheme=EXTERNAL_MANAGED --network-tier=STANDARD \ + --network=default --address="${NAME}-tls-ip" \ + --target-http-proxy="${NAME}-tls-http-proxy" \ + --target-http-proxy-region="$GCP_REGION" --ports=80 >/dev/null + fi + + cat < 301 redirect to :443 (so plain http:// works + as long as the client follows redirects, e.g. curl -L) + +To test from another machine, append this single line to /etc/hosts: + + $lb_ip ${domains_csv//,/ } + +For a real cert (cert-manager / Let's Encrypt), upload a fresh PEM ++ key as ${NAME}-tls-cert-vN, then point the proxy at it via +\`gcloud compute target-https-proxies update ${NAME}-tls-proxy +--ssl-certificates=${NAME}-tls-cert-vN --region=$GCP_REGION\`. +================================================================ + +EOF +} + +# do_tls_teardown deletes everything do_tls_frontend created. +# Idempotent: missing resources are not errors. Order matters -- +# the forwarding rule has to go before the proxy/url-map/backend +# chain, and the IP after. 
+do_tls_teardown() { + local fr fr_http proxy http_proxy urlmap urlmap_redirect backend neg hc cert ip subnet + fr=$(gcloud compute forwarding-rules describe "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing fr is not an error + fr_http=$(gcloud compute forwarding-rules describe "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing :80 redirect fr is not an error + proxy=$(gcloud compute target-https-proxies describe "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing proxy is not an error + http_proxy=$(gcloud compute target-http-proxies describe "${NAME}-tls-http-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing :80 redirect proxy is not an error + urlmap=$(gcloud compute url-maps describe "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing url-map is not an error + urlmap_redirect=$(gcloud compute url-maps describe "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing redirect url-map is not an error + backend=$(gcloud compute backend-services describe "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing backend is not an error + neg=$(gcloud compute network-endpoint-groups describe "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing neg is not an error + hc=$(gcloud compute health-checks describe "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing hc is not an error + cert=$(gcloud compute ssl-certificates describe "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing cert is not an error + ip=$(gcloud compute addresses describe "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing ip is not an error + subnet=$(gcloud compute networks subnets describe "${NAME}-tls-proxy-subnet" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" \ + --format='value(name)' 2>/dev/null) || true # y-script-lint:disable=or-true # missing subnet is not an error + + if [[ -z "$fr$fr_http$proxy$http_proxy$urlmap$urlmap_redirect$backend$neg$hc$cert$ip$subnet" ]]; then + return + fi + + stage "deleting TLS LB stack (${NAME}-tls-*)" + # Forwarding rules first (they reference proxies) -- both :443 + # and the :80 redirect. 
+ [[ -n "$fr" ]] && gcloud compute forwarding-rules delete "${NAME}-tls-fr" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$fr_http" ]] && gcloud compute forwarding-rules delete "${NAME}-tls-fr-http" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + # Then proxies (they reference URL maps). + [[ -n "$proxy" ]] && gcloud compute target-https-proxies delete "${NAME}-tls-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$http_proxy" ]] && gcloud compute target-http-proxies delete "${NAME}-tls-http-proxy" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + # Then URL maps (the :443 backend-pointing one + the :80 redirect one). + [[ -n "$urlmap" ]] && gcloud compute url-maps delete "${NAME}-tls-urlmap" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$urlmap_redirect" ]] && gcloud compute url-maps delete "${NAME}-tls-redirect" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$backend" ]] && gcloud compute backend-services delete "${NAME}-tls-backend" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$neg" ]] && gcloud compute network-endpoint-groups delete "${NAME}-tls-neg" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + [[ -n "$hc" ]] && gcloud compute health-checks delete "${NAME}-tls-hc" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$cert" ]] && gcloud compute ssl-certificates delete "${NAME}-tls-cert" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + [[ -n "$ip" ]] && gcloud compute addresses delete "${NAME}-tls-ip" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + # Subnet last: only delete the per-build one (do_tls_frontend + # never creates a subnet that already exists, so anything named + # ${NAME}-tls-proxy-subnet was definitely ours). + [[ -n "$subnet" ]] && gcloud compute networks subnets delete "${NAME}-tls-proxy-subnet" \ + --project="$GCP_PROJECT" --region="$GCP_REGION" --quiet >/dev/null + # Force a 0 return: the [[ -n "$subnet" ]] && ... pattern above + # returns 1 when $subnet is empty (subnet was reused, not + # created by this run). Without this, set -e in the caller + # treats the function as failed and aborts before the + # "teardown complete" stage + the PRESERVED message can fire. + return 0 +} + +# Minimal pre-checks shared by build and teardown: gcloud +# binary + GCP key + activation. The build flow does +# additional tool checks below the dispatch. +command -v gcloud >/dev/null \ + || { echo "missing required tool: gcloud" >&2; exit 1; } + +if [[ ! -f "$GCP_KEY" ]]; then + echo "missing GCP key: $GCP_KEY" >&2 + echo "create it with: scripts/gcp-bootstrap-credentials.sh on a machine with gcloud Owner access" >&2 + exit 1 +fi +# Fail fast on a malformed key file (truncated, wrong export +# format, expired exports that lost their private_key) BEFORE +# gcloud activate-service-account errors with a less helpful +# message. The four fields below are the minimum GCP requires +# to authenticate as a service account. +if ! jq -e 'all(.type == "service_account"; .) 
and (.project_id // empty | length > 0) and (.client_email // empty | length > 0) and (.private_key // empty | length > 0)' "$GCP_KEY" >/dev/null 2>&1; then + echo "GCP key at $GCP_KEY is missing required fields" >&2 + echo " expected JSON with: type=service_account, project_id, client_email, private_key" >&2 + echo " regenerate via: scripts/gcp-bootstrap-credentials.sh" >&2 + exit 1 +fi +export GOOGLE_APPLICATION_CREDENTIALS="$GCP_KEY" + +# Acknowledge parallel composite uploads up front. The setting +# both turns on multi-stream uploads (which is what we want for +# 1.5+ GiB tarballs) AND silences the WARNING stanza gcloud +# would otherwise emit on every `storage cp`. Env-var form so +# we don't mutate the operator's gcloud config. +export CLOUDSDK_STORAGE_PARALLEL_COMPOSITE_UPLOAD_ENABLED=True + +stage "activating GCP service account ($GCP_KEY)" +gcloud auth activate-service-account --key-file="$GCP_KEY" --project="$GCP_PROJECT" >/dev/null + +# Subcommand dispatch. Teardown only needs gcloud + GCP_KEY, +# both verified above; doesn't need go / qemu-img / etc. so +# the build-flow tool check below stays out of its path. +if [[ "${1:-}" = "teardown" ]]; then + shift + do_teardown "$@" + exit 0 +fi + +# Build-flow arg parsing. Today: just --reuse-disk=true|false. +# Empty REUSE_DISK + interactive prompt at Stage 8.5 if the disk +# already exists; non-interactive runs without the flag error +# out at Stage 8.5 with a clear message. +REUSE_DISK="" +while [[ $# -gt 0 ]]; do + case "$1" in + --reuse-disk=true) REUSE_DISK=true ;; + --reuse-disk=false) REUSE_DISK=false ;; + *) echo "unknown build flag: $1" >&2; exit 2 ;; + esac + shift +done + +# Fail early if ASSUME_YES is set without an explicit disk +# decision: ASSUME_YES means "unattended; don't prompt me", and +# disk handling is irreversible (--reuse-disk=false destroys +# customer state). Don't let that slip through to a prompt at +# Stage 8.5 that would either hang (TTY-less) or be answered +# by a default the operator never deliberately picked. +if [[ -n "${ASSUME_YES:-}" && -z "$REUSE_DISK" ]]; then + echo "ASSUME_YES set without --reuse-disk=true|false: refusing to" >&2 + echo "guess at an irreversible decision. Pick one explicitly:" >&2 + echo " --reuse-disk=true preserve customer state (production upgrade)" >&2 + echo " --reuse-disk=false delete + recreate (QA seed validation)" >&2 + exit 2 +fi + +# Build-flow tool check (additional to gcloud above). +for tool in go qemu-system-x86_64 qemu-img kubectl ssh ssh-keygen curl virt-sysprep; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +# virt-sysprep needs to read /boot/vmlinuz-* (libguestfs supermin). +if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then + cat >&2 < $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +mkdir -p "$CFG_DIR" +# YAML emission omits any port the operator didn't override, letting +# y-cluster's Go binary apply its own defaults (sshPort=2222, +# portForwards={6443:6443, 80:80, 443:443}). Set APP_*_PORT to take +# different values; otherwise the script doesn't restate y-cluster's +# defaults in two places. 
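+# Illustrative shape of the emitted file when only APP_HTTP_PORT=8080
+# is overridden (8080 is an example value; sshPort is omitted entirely
+# and the other keys take the literals echoed below):
+#   provider: qemu
+#   name: <$NAME>
+#   context: <$KUBECTX>
+#   memory: "8192"
+#   cpus: "2"
+#   diskSize: "40G"
+#   portForwards:
+#     - host: "8080"
+#       guest: "80"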
+{ + echo "provider: qemu" + echo "name: $NAME" + echo "context: $KUBECTX" + [ -n "${APP_SSH_PORT:-}" ] && printf 'sshPort: "%s"\n' "$APP_SSH_PORT" + echo 'memory: "8192"' + echo 'cpus: "2"' + echo 'diskSize: "40G"' + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + echo "portForwards:" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" + fi +} > "$CFG_DIR/y-cluster-provision.yaml" + +stage "tearing down any leftover $NAME cluster" +"$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error + +# Bail-out guard: our own teardown above would have removed +# the kubectl context THIS script registered on a previous +# run. A surviving "$KUBECTX" entry means something else owns +# it (e.g., a parallel y-cluster cluster, or the operator's +# personal "local" dev cluster). We refuse to clobber. +if kubectl config get-contexts -o name 2>/dev/null | grep -Fxq "$KUBECTX"; then + echo "kubectl context '$KUBECTX' already exists and is not owned by this script." >&2 + echo " Either remove it: kubectl config delete-context $KUBECTX" >&2 + echo " Or pick a new name: KUBECTX=appliance-qa $0" >&2 + exit 1 +fi + +stage "provisioning $NAME (k3s + Envoy Gateway)" +"$Y_CLUSTER" provision -c "$CFG_DIR" + +# Echo is what creates the Gateway listener (not just the +# Envoy Gateway controller -- the actual Gateway resource that +# binds :80). Without it, any HTTPRoute the operator applies +# in the hands-on window has nothing to attach to and curl +# returns "connection refused" both locally and on the eventual +# GCP VM. Auto-install so the Gateway listener is up by default; +# operators can still delete + replace echo with their own +# workload (the Gateway listener stays, the routing changes). +stage "installing echo workload (Gateway listener + baseline route)" +"$Y_CLUSTER" echo render \ + | kubectl --context="$KUBECTX" apply --server-side --field-manager=appliance-build -f - +kubectl --context="$KUBECTX" -n y-cluster wait \ + --for=condition=Available deployment/echo --timeout=180s + +# Seed hook: caller-supplied cmd runs after echo is up but +# before PROMPT 1 / TLS_DOMAINS=auto resolution. Customer +# workloads applied here (mariadb, kafka, keycloak, HTTPRoute / +# GRPCRoute resources, etc.) populate /data/yolean for the data-seed +# extraction in prepare-export AND give TLS_DOMAINS=auto real +# hostnames to derive from. Non-zero exit aborts; local +# cluster stays up for inspection (set -e + the +# "aborted; local cluster left running" semantics of the +# upcoming PROMPT 1 path are what the operator falls back on). +if [[ -n "${APPLIANCE_SEED_CMD:-}" ]]; then + stage "applying seed (APPLIANCE_SEED_CMD)" + current_env + # set -o pipefail so a `cmd | tee log` chain in the + # caller's string doesn't swallow upstream failures. + bash -c "set -o pipefail; $APPLIANCE_SEED_CMD" +fi + +# === Stage 2: hands-on prompt === +SSH_KEY="$CACHE_DIR/$NAME-ssh" +cat <&2 + echo " Apply HTTPRoutes with .spec.hostnames first, or set TLS_DOMAINS=foo,bar to override." 
>&2 + exit 1 + } + echo " TLS_DOMAINS=$TLS_DOMAINS" +fi + +# === Stage 3: prepare-export + export gcp-tar === +# prepare-export needs the cluster RUNNING: its live phase +# clears the per-deploy yolean.se/dns-hint-ip annotation and +# snapshots reconciled Gateway state into /- +# gateway-state.json. It then stops the VM itself before the +# offline phase (libguestfs needs the disk not in use). +# Calling `y-cluster stop` here would defeat that. +stage "prepare-export ($NAME)" +"$Y_CLUSTER" prepare-export --context="$KUBECTX" + +# Dual export to per-format subdirs of the deliverable. +# Both reads come from the same prepare-export'd qcow2 so +# the disk state is byte-identical; the only differences are +# the on-the-wire packaging (tar.gz with disk.raw vs OVF + +# streamOptimized VMDK in tar) and the per-format README. +# The SSH keypair `-ssh{,.pub}` lands in both subdirs; +# the pair is identical (one keypair was generated at +# provision time, both export passes copy from the same +# source under $CACHE_DIR). +mkdir -p "$BUNDLE_DIR" + +stage "exporting Compute Engine image format -> $BUNDLE_DIR/gcp-tar" +"$Y_CLUSTER" export --context="$KUBECTX" --format=gcp-tar "$BUNDLE_DIR/gcp-tar" + +stage "exporting OVA (VirtualBox / VMware Import Appliance) -> $BUNDLE_DIR/ova" +"$Y_CLUSTER" export --context="$KUBECTX" --format=ova "$BUNDLE_DIR/ova" + +ls -lh "$BUNDLE_DIR"/*/ +TARBALL="$BUNDLE_DIR/gcp-tar/$NAME.tar.gz" + +# === Stage 4: confirm before any GCP write === +cat </dev/null || stat -c '%s' "$TARBALL") + +Next: upload to gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz, create a +GCE custom image, ensure firewall opens tcp:80 + tcp:443 on +tagged VMs, create $VM_NAME ($GCP_MACHINE_TYPE in $GCP_ZONE) +from the image. Aborting now leaves the bundle on local disk +unchanged. +================================================================ + +EOF + +confirm "Upload $TARBALL to GCS and create VM in $GCP_PROJECT?" \ + || { echo "aborted; bundle preserved at $BUNDLE_DIR."; exit 0; } + +# === Stage 5: GCS bucket (idempotent) === +stage "ensuring GCS bucket gs://$GCP_BUCKET (location $GCP_REGION)" +if ! gcloud storage buckets describe "gs://$GCP_BUCKET" --project="$GCP_PROJECT" >/dev/null 2>&1; then + gcloud storage buckets create "gs://$GCP_BUCKET" \ + --project="$GCP_PROJECT" \ + --location="$GCP_REGION" \ + --uniform-bucket-level-access +else + echo " bucket exists" +fi + +# === Stage 6: upload tarball === +stage "uploading $TARBALL -> gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" +gcloud storage cp "$TARBALL" "gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" --project="$GCP_PROJECT" + +# === Stage 7: create custom image === +stage "creating GCE custom image $IMAGE_NAME (family $GCP_IMAGE_FAMILY)" +gcloud compute images create "$IMAGE_NAME" \ + --project="$GCP_PROJECT" \ + --source-uri="gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" \ + --family="$GCP_IMAGE_FAMILY" \ + --architecture=X86_64 \ + >/dev/null + +# === Stage 8: firewall rule (idempotent) === +FIREWALL_RULE="y-cluster-appliance-public" +stage "ensuring firewall rule $FIREWALL_RULE (tcp:80,443 -> y-cluster-appliance tag)" +if ! 
gcloud compute firewall-rules describe "$FIREWALL_RULE" --project="$GCP_PROJECT" >/dev/null 2>&1; then + gcloud compute firewall-rules create "$FIREWALL_RULE" \ + --project="$GCP_PROJECT" \ + --direction=INGRESS \ + --network=default \ + --action=ALLOW \ + --rules=tcp:80,tcp:443 \ + --target-tags=y-cluster-appliance \ + --source-ranges=0.0.0.0/0 \ + >/dev/null +else + echo " rule exists" +fi + +# === Stage 8.5: ensure persistent data disk === +# Persistent disk attached to the VM and mounted at /data/yolean +# (the bundled local-path-provisioner's default storage root). +# Survives instance redeploys: tear down the VM, redeploy with a +# fresh image, the same /data/yolean comes back. Disk auto-delete +# is OFF when attaching an existing disk via --disk=name=, so +# `instances delete` won't wipe it. +# +# Reuse decision (when the disk already exists): +# --reuse-disk=true reuse silently (preserves customer state -- the +# production upgrade path; data-seed unit no-ops +# on the existing marker, so the new image's seed +# is correctly NOT applied) +# --reuse-disk=false delete + recreate (fresh disk lets the build +# image's seed extract -- the QA validation path) +# no flag, TTY interactive prompt, default Y (reuse) +# no flag, no TTY error + exit (explicit choice required for +# irreversible decisions in unattended runs) +stage "ensuring persistent data disk $GCP_DATADIR_DISK (size only used on create: $GCP_DATADIR_SIZE)" +if gcloud compute disks describe "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + case "$REUSE_DISK" in + true) + echo " disk exists -- reusing (--reuse-disk=true; preserves customer state)" + ;; + false) + echo " disk exists -- --reuse-disk=false: deleting and recreating" + # Detach by deleting the VM first if it's still attached. + # Stage 9 normally handles VM deletion for idempotency; we + # do it here too because gcloud compute disks delete refuses + # while the disk is mounted on a running instance. + if gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + echo " $VM_NAME exists, deleting first to release disk" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + gcloud compute disks delete "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + gcloud compute disks create "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --size="$GCP_DATADIR_SIZE" \ + --type=pd-balanced \ + >/dev/null + echo " disk recreated (fresh; will be ext4-formatted on first mount)" + ;; + *) + if prompt_yes_default \ + " Reuse existing data disk $GCP_DATADIR_DISK with its preserved state?" 
\ + "pass --reuse-disk=true (preserve state) or --reuse-disk=false (delete + reseed)"; then + echo " reusing (preserves customer state)" + else + echo " --reuse-disk=false chosen: deleting and recreating" + if gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + echo " $VM_NAME exists, deleting first to release disk" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + fi + gcloud compute disks delete "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null + gcloud compute disks create "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --size="$GCP_DATADIR_SIZE" \ + --type=pd-balanced \ + >/dev/null + echo " disk recreated (fresh; will be ext4-formatted on first mount)" + fi + ;; + esac +else + gcloud compute disks create "$GCP_DATADIR_DISK" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --size="$GCP_DATADIR_SIZE" \ + --type=pd-balanced \ + >/dev/null + echo " disk created (fresh; will be ext4-formatted on first mount)" +fi + +# === Stage 9: create VM (delete first if exists for idempotency) === +stage "creating $VM_NAME ($GCP_MACHINE_TYPE in $GCP_ZONE) from image $IMAGE_NAME" +if gcloud compute instances describe "$VM_NAME" --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then + echo " $VM_NAME exists, deleting first" + gcloud compute instances delete "$VM_NAME" \ + --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null +fi +# device-name=datadir is what GCE writes after the +# `scsi-0Google_PersistentDisk_` prefix in /dev/disk/by-id/ +# inside the VM; the SSH-side mount block uses that stable path +# regardless of /dev/sd* enumeration order. +gcloud compute instances create "$VM_NAME" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --machine-type="$GCP_MACHINE_TYPE" \ + --image="$IMAGE_NAME" \ + --image-project="$GCP_PROJECT" \ + --boot-disk-size=40GB \ + --disk="name=$GCP_DATADIR_DISK,device-name=datadir,mode=rw,boot=no" \ + --tags=y-cluster-appliance \ + >/dev/null + +PUBLIC_IP=$(gcloud compute instances describe "$VM_NAME" \ + --project="$GCP_PROJECT" \ + --zone="$GCP_ZONE" \ + --format='get(networkInterfaces[0].accessConfigs[0].natIP)') +echo " public ip: $PUBLIC_IP" + +# === Stage 10: wait for ssh + probe === +# SSH_KEY (from CACHE_DIR) was used by the local cluster but is +# wiped by `y-cluster teardown` at the end of this flow. The +# bundle-dir copy is what the operator can reach the GCP VM +# with afterwards. Switch to the bundle path BEFORE teardown +# runs so subsequent prints reference the path that'll exist. 
+SSH_KEY="$BUNDLE_DIR/gcp-tar/$NAME-ssh" +SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh on $PUBLIC_IP:22 (cloud-init can take 30-90s on first boot)" +ssh_up=0 +for i in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS -p 22 ystack@"$PUBLIC_IP" 'true' 2>/dev/null; then + echo " ssh up after $i attempt(s)" + ssh_up=1 + break + fi + echo " ssh attempt $i/60: not yet" + sleep 5 +done +if [[ $ssh_up -eq 0 ]]; then + echo "ssh on $PUBLIC_IP never came up; VM left running for diagnosis" >&2 + echo " delete with: gcloud compute instances delete $VM_NAME --project=$GCP_PROJECT --zone=$GCP_ZONE" >&2 + exit 1 +fi + +# === Stage 10.5: mount the persistent disk at /data/yolean === +# The appliance disk doesn't carry GCE guest-tools and our +# prepare_inguest pinned cloud-init to NoCloud only, so we can't +# mount via cloud-init mounts/ or via google-startup-scripts. +# We SSH in and do it directly: +# - format the disk if it has no filesystem (fresh disk) +# - persist the mount via fstab UUID for subsequent reboots +# - mount now +# - restart k3s so it re-discovers /data/yolean (k3s started +# before the mount existed; existing PVs would have mapped +# to empty paths on the root FS until restart) +stage "mounting $GCP_DATADIR_DISK at /data/yolean and restarting k3s" +# shellcheck disable=SC2087 +ssh $SSH_OPTS ystack@"$PUBLIC_IP" 'sudo bash -s' <<'REMOTE' +set -eu +# /dev/disk/by-id/google- requires google-guest-agent, +# which only ships in Google's own GCE images. Our appliance is +# built from the upstream Ubuntu cloud image, so we get the +# kernel-provided SCSI udev path instead: +# /dev/disk/by-id/scsi-0Google_PersistentDisk_ +# `` is what we passed to `gcloud --disk=device-name=datadir`, +# so the path is fully deterministic. We try both shapes -- SCSI +# first (matches the current appliance) and the guest-agent shape +# as a fallback for a future build that does install the agent. +MOUNT=/data/yolean +DEVICE="" +for cand in /dev/disk/by-id/scsi-0Google_PersistentDisk_datadir /dev/disk/by-id/google-datadir; do + for _ in $(seq 1 30); do + if [ -b "$cand" ]; then + DEVICE="$cand" + break 2 + fi + sleep 1 + done +done +[ -n "$DEVICE" ] || { echo "datadir disk never appeared at any expected /dev/disk/by-id/ path" >&2; exit 1; } +echo "datadir: $DEVICE" + +# Format with the label that matches the appliance's pre-baked +# fstab entry (LABEL=y-cluster-data /data/yolean ext4 ...). +# Using a different label, or adding a UUID-based fstab line, +# would either skip the pre-bake mount or duplicate it -- we +# want the LABEL line to be the one that fires at boot. +if ! blkid "$DEVICE" >/dev/null 2>&1; then + mkfs.ext4 -F -L y-cluster-data "$DEVICE" +fi +# Idempotent label enforcement: re-running this script against a +# data disk that was formatted by a PREVIOUS version of the script +# (with a different label, e.g. `data-yolean`) would skip mkfs +# above (blkid finds an existing FS) and leave the wrong label in +# place. The appliance's pre-baked /etc/fstab matches by LABEL, so +# a wrong label means the boot-time mount silently no-ops and the +# seed gate fails. e2label is a no-op when the label is already +# correct, so applying it unconditionally is cheap insurance. +e2label "$DEVICE" y-cluster-data + +install -d -m 0755 "$MOUNT" +if ! 
mountpoint -q "$MOUNT"; then + mount "$MOUNT" +fi + +# At first boot the seed unit ran before this disk was formatted +# and mounted, so it failed the mount-required gate and k3s.service +# stayed down on its Requires=. Now that /data/yolean is a real +# mountpoint, restart the seed unit so it extracts the seed onto +# the customer's volume, then k3s. +systemctl reset-failed y-cluster-data-seed.service k3s.service +systemctl restart y-cluster-data-seed.service +systemctl restart k3s.service +REMOTE + +probe() { + local what=$1 url=$2 attempts=${3:-60} + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o /dev/null -w " $what HTTP %{http_code}\n" "$url"; then + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 10 + done + return 1 +} + +stage "probing http://$PUBLIC_IP -- whatever you applied locally" +# We don't know the operator's routes a priori; try the +# y-cluster-shipped echo path as a baseline. If their workload +# replaced echo, this fails and the operator curls their own +# route. +# +# We deliberately do NOT enumerate Gateway routes via SSH+kubectl +# here. Post-import scripting that uses maintainer-only paths +# (SSH keys + kubectl access) is a smell: the customer's hosting +# can't replicate it, and tying the build flow to the api-server +# warm-up period creates spurious halts on transient kubectl +# failures (we hit one of those during this script's lifetime -- +# the silent abort right after `systemctl restart k3s.service`). +# Any per-route probing belongs in the consumer-side validate +# script which knows its own routes from its own kustomize. +probe echo "http://$PUBLIC_IP/q/envoy/echo" 30 || \ + echo " (no echo route -- expected if your workload replaced y-cluster echo)" + +# === Stage 11: optional external HTTPS LoadBalancer === +# Operator-driven add-on: if TLS_DOMAINS isn't set in the env, +# prompt for it (skip on empty input). With ASSUME_YES + TLS_DOMAINS +# set, runs without prompting. With ASSUME_YES alone, skip silently +# -- ASSUME_YES is for unattended e2e and we don't want to surprise +# the operator with a billing meter they didn't ask for. +if [[ -z "${TLS_DOMAINS:-}" && -z "${ASSUME_YES:-}" ]]; then + echo + echo "================================================================" + echo "Optional: external HTTPS LoadBalancer (regional, EXTERNAL_MANAGED)" + echo + echo "Sets up a regional GCP External Application Load Balancer in" + echo "front of $VM_NAME with a SELF-SIGNED cert covering the FQDNs" + echo "you specify. Useful for testing the LB+routing chain without" + echo "DNS or a real CA. Browsers will warn on the cert; tools need" + echo "--insecure / -k. Cost: ~hourly forwarding-rule + reserved IP." + echo + echo "HTTPRoutes on the cluster need spec.hostnames covering the" + echo "same FQDNs (the LB forwards Host: unchanged). Patch them" + echo "yourself before answering yes." + echo "================================================================" + read -r -p "FQDNs (comma-separated, empty to skip): " TLS_DOMAINS +fi +if [[ -n "${TLS_DOMAINS:-}" ]]; then + do_tls_frontend "$TLS_DOMAINS" +fi + +# Verify hook: caller-supplied cmd runs after the GCE VM is +# up + optional TLS LB is configured. Receives the full +# Y_CLUSTER_CURRENT_* surface including REMOTE_VM_IP, +# REMOTE_LB_IP (re-queried by current_env), REMOTE_DOMAINS, +# REMOTE_SCHEME -- enough to compose curl --resolve probes +# without /etc/hosts. Non-zero exit aborts; the VM and LB +# stay up for inspection. 
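+# A minimal probe sketch (assumes the surface composes as
+# Y_CLUSTER_CURRENT_REMOTE_DOMAINS / Y_CLUSTER_CURRENT_REMOTE_LB_IP --
+# check current_env for the exact names; -k because the LB cert is
+# self-signed):
+#   APPLIANCE_VERIFY_CMD='
+#     host=${Y_CLUSTER_CURRENT_REMOTE_DOMAINS%%,*}
+#     curl -fsSk --resolve "$host:443:$Y_CLUSTER_CURRENT_REMOTE_LB_IP" \
+#       "https://$host/q/envoy/echo"'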
+if [[ -n "${APPLIANCE_VERIFY_CMD:-}" ]]; then + stage "remote verify (APPLIANCE_VERIFY_CMD)" + current_env + # Same pipefail discipline as APPLIANCE_SEED_CMD. + bash -c "set -o pipefail; $APPLIANCE_VERIFY_CMD" +fi + +if [[ -z "${KEEP_LOCAL:-}" ]]; then + stage "tearing down local cluster (set KEEP_LOCAL=1 to keep it)" + "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null || true # y-script-lint:disable=or-true # cleanup best-effort +fi + +cat < /data/yolean (persistent) + Deliverable: $BUNDLE_DIR + ├── gcp-tar/ (uploaded to GCE, used for the + │ live $VM_NAME above) + └── ova/ (hand to a customer for VirtualBox / + VMware -- same disk state) + +Connect: + # One-time per appliance, fetch the kubeconfig onto the laptop. + # (ssh can't stream a remote file to a local path AND give you + # an interactive shell on the same connection -- stdin/stdout + # is owned by the shell -- so this fetch is its own one-shot + # ssh, separate from the interactive one below.) + ssh -i $SSH_KEY ystack@$PUBLIC_IP sudo cat /etc/rancher/k3s/k3s.yaml \\ + > ~/.kube/y-appliance-portforwarded + + # Interactive shell + apiserver tunnel; the tunnel stays up + # until you exit the shell. + ssh -i $SSH_KEY -L 6443:127.0.0.1:6443 ystack@$PUBLIC_IP + +Teardown when done: + $0 teardown + (preserves /data/yolean by default; pass --keep-disk=false to also + delete the persistent disk. Teardown's exit message lists the + disk + recommended delete command for later cleanup.) +================================================================ +EOF diff --git a/scripts/e2e-appliance-export-import.sh b/scripts/e2e-appliance-export-import.sh new file mode 100755 index 0000000..e4b5099 --- /dev/null +++ b/scripts/e2e-appliance-export-import.sh @@ -0,0 +1,319 @@ +#!/usr/bin/env bash +# Round-trip an y-cluster appliance through the export/import contract: +# build with y-cluster, install a placeholder application via kubectl, +# prepare-export, stop, copy the qcow2, then boot a SECOND qemu +# instance against the copy with no y-cluster involvement (simulating +# the customer's IT importing on their hypervisor) and verify the +# application reaches a 200 from a fresh process. +# +# Why this exists: +# The "build a per-customer appliance, ship it, customer boots it" +# pathway has never been e2e-tested. The Hetzner Packer flow proved +# snapshot+clone works on Hetzner; it doesn't tell us whether a +# qcow2 produced locally boots cleanly elsewhere. This script is +# the missing test. +# +# Conventions: +# - The application is opaque to y-cluster. We use the echo +# manifest as a placeholder, but install it via `y-cluster echo +# render | kubectl apply -f -` -- the same shape the eventual +# per-customer install will use (kubectl / kustomize / helm +# against the live cluster). y-cluster has no `echo deploy`-like +# special case here. +# - The customer-side qemu invocation is bare bash. No y-cluster +# binary, no seed image, no cloud-init reattach. Just qemu-system +# against the exported qcow2 with new port forwards. If the +# appliance can't survive that, prepare-export has the bug. +# +# Stages: +# 1. Build the dev binary into ./dist (gitignored). +# 2. Provision an appliance (k3s + Envoy Gateway only) under a +# throwaway name. +# 3. Apply the placeholder app via kubectl. +# 4. Smoketest curl on the build-side host. +# 5. y-cluster stop + prepare-export. +# 6. y-cluster export to a bundle dir (flattened qcow2 + +# keypair + README). +# 7. Boot a fresh qemu against the BUNDLED qcow2 with new +# port forwards. 
The bundle has no backing-file dependency +# on y-cluster's cloud-image cache; this proves the disk is +# genuinely portable. +# 8. Wait for ssh + curl on the imported instance. +# 9. On failure, ssh in and dump k3s state for diagnosis. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='e2e-appliance-export-import.sh - local round-trip provision -> kubectl install -> prepare-export -> stop -> raw-qemu boot -> verify + +Usage: e2e-appliance-export-import.sh + +Environment: + NAME Appliance name (default: appliance-export-test) + APP_HTTP_PORT Override build-side host port for guest 80 (y-cluster default: 80) + APP_HTTPS_PORT Override build-side host port for guest 443 (y-cluster default: 443) + APP_API_PORT Override build-side host port for guest 6443 (y-cluster default: 6443) + APP_SSH_PORT Override build-side host port for guest 22 (y-cluster default: 2222) + IMP_HTTP_PORT Import-side host port -> guest 80 (default: 39180) + IMP_SSH_PORT Import-side host port -> guest 22 (default: 2230) + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + CACHE_DIR Where y-cluster keeps its qcow2 (default: ~/.cache/y-cluster-qemu) + KEEP_BUILD Set to keep the build-side cluster after success (default: tear it down) + DEBUG Set non-empty for bash trace + +Dependencies: + go, qemu-system-x86_64, kubectl, ssh, ssh-keygen, curl, virt-sysprep (libguestfs-tools) + +Exit codes: + 0 Round-trip succeeded; imported instance answered the smoketest + 1 Any stage failed; build-side cluster left up for diagnosis +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +NAME="${NAME:-appliance-export-test}" +# Import-side host ports: kept hardcoded (not env-overridable + +# defaulted) because the import-side qemu is started directly by +# this script (no y-cluster CLI involvement) and these values +# can't collide with the build-side y-cluster's defaults. +IMP_HTTP_PORT="${IMP_HTTP_PORT:-39180}" +IMP_SSH_PORT="${IMP_SSH_PORT:-2230}" + +REPO_ROOT="$(git rev-parse --show-toplevel)" +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}" +EXPORT_DIR=$(mktemp -d -p /tmp e2e-export.XXXXXX) +CFG_DIR=$(mktemp -d -p /tmp e2e-config.XXXXXX) + +stage() { printf '\n=== %s ===\n' "$*"; } + +cleanup() { + set +e + if [[ -f "$EXPORT_DIR/imported.pid" ]]; then + local imp_pid + imp_pid=$(cat "$EXPORT_DIR/imported.pid" 2>/dev/null) + if [[ -n "$imp_pid" ]] && kill -0 "$imp_pid" 2>/dev/null; then + echo "stopping imported qemu (pid $imp_pid)" + kill -TERM "$imp_pid" 2>/dev/null # y-script-lint:disable=or-true # not relevant here + sleep 2 + kill -KILL "$imp_pid" 2>/dev/null # y-script-lint:disable=or-true # may already be gone + fi + fi +} +trap cleanup EXIT + +for tool in go qemu-system-x86_64 kubectl ssh ssh-keygen curl virt-sysprep; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +# virt-sysprep on Ubuntu fails before it touches the qcow2 if it +# can't read /boot/vmlinuz-* (libguestfs builds a tiny appliance VM +# with the host kernel via supermin). Ubuntu installs kernel images +# 0600 root, so non-root invocations bail with an opaque +# "supermin exited with error status 1". Surface the fix here. +if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then + cat >&2 < $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === 2. 
Provision the build-side appliance === +# Idempotent re-run: tear down any leftover from a prior failed run. +stage "tearing down any leftover $NAME cluster" +# We need the config in place for teardown to find the cluster, so +# write it BEFORE the teardown attempt. teardown is idempotent +# (no-op when the cluster doesn't exist) so re-entry is safe. +# YAML emission omits any port the operator didn't override, letting +# y-cluster's Go binary apply its own defaults (sshPort=2222, +# portForwards={6443:6443, 80:80, 443:443}). +{ + echo "provider: qemu" + echo "name: $NAME" + echo "context: $NAME" + [ -n "${APP_SSH_PORT:-}" ] && printf 'sshPort: "%s"\n' "$APP_SSH_PORT" + echo 'memory: "4096"' + echo 'cpus: "2"' + echo 'diskSize: "40G"' + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + echo "portForwards:" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" + fi +} > "$CFG_DIR/y-cluster-provision.yaml" + +"$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error +rm -f "$CACHE_DIR/$NAME".* "$CACHE_DIR/$NAME-"* + +stage "provisioning appliance ($NAME) -- k3s + Envoy Gateway only" +"$Y_CLUSTER" provision -c "$CFG_DIR" + +# === 3. Customer install via kubectl === +# This deliberately uses kubectl, not `y-cluster echo deploy`. The +# pipeline below is exactly the shape the per-customer install path +# will take (render manifests, kubectl apply against the live +# cluster). y-cluster has no special case for the workload here. +stage "installing echo workload (Envoy Gateway + HTTPRoute)" +"$Y_CLUSTER" echo render \ + | kubectl --context="$NAME" apply --server-side --field-manager=customer-install -f - +kubectl --context="$NAME" -n y-cluster wait \ + --for=condition=Available deployment/echo --timeout=180s + +# Stateful workload: VersityGW (S3-over-posix gateway) backed by a +# 1Gi local-path PVC. Tests the persistence path that the simpler +# echo workload skips. +stage "installing VersityGW StatefulSet via yconverge" +"$Y_CLUSTER" yconverge --context="$NAME" \ + -k "$REPO_ROOT/testdata/appliance-stateful/base" + +# === 4. Build-side smoketest === +stage "build-side smoketest: echo + s3" +probe() { + local what=$1 url=$2 attempts=${3:-30} + local out + out=$(mktemp) + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o "$out" -w " $what HTTP %{http_code}\n" "$url"; then + rm -f "$out" + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 5 + done + echo "$what smoketest never succeeded; aborting" >&2 + rm -f "$out" + return 1 +} +probe echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" +probe s3 "http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" + +# === 5. stop + prepare-export === +# y-cluster stop owns the graceful guest shutdown (ssh +# poweroff -> wait for qemu exit -> SIGTERM/SIGKILL fallback). +# Without that, qemu's SIGTERM exits in ~200ms and the guest's +# k3s/containerd state isn't flushed, leaving zero-byte +# overlayfs snapshot files on the qcow2 and "exec format error" +# crash loops on the imported boot. +stage "stopping cluster ($NAME)" +"$Y_CLUSTER" stop --context="$NAME" + +stage "prepare-export ($NAME)" +"$Y_CLUSTER" prepare-export --context="$NAME" + +# === 6. 
y-cluster export -> bundle dir === +# Produces a flattened, self-contained qcow2 (no backing file) +# plus the keypair plus a README. EXPORT_DIR was created by +# mktemp; the export subcommand refuses to write into a +# non-empty dir, so remove that dir and re-create it after the +# export. +BUNDLE_DIR="$EXPORT_DIR/bundle" +stage "exporting bundle to $BUNDLE_DIR (--format=qcow2)" +"$Y_CLUSTER" export --context="$NAME" --format=qcow2 "$BUNDLE_DIR" +ls -la "$BUNDLE_DIR/" +echo " qemu-img info on the bundled disk:" +qemu-img info "$BUNDLE_DIR/$NAME.qcow2" | grep -E '^(file format|virtual size|disk size|backing)' | sed 's/^/ /' + +# === 7. Customer-side: raw qemu against the bundled disk === +# No y-cluster involvement here -- just qemu-system-x86_64 +# pointed at the bundled qcow2 + the bundled key. This proves +# the bundle is genuinely self-contained: any host that can run +# qemu (with the cloud image NOT present at the build path) +# would boot it. +stage "booting bundled qcow2 via raw qemu (host ports $IMP_SSH_PORT -> :22, $IMP_HTTP_PORT -> :80)" +qemu-system-x86_64 \ + -name "$NAME-imported" \ + -machine accel=kvm -cpu host \ + -smp 2 -m 4096 \ + -drive "file=$BUNDLE_DIR/$NAME.qcow2,format=qcow2,if=virtio" \ + -netdev "user,id=n0,hostfwd=tcp::$IMP_SSH_PORT-:22,hostfwd=tcp::$IMP_HTTP_PORT-:80" \ + -device virtio-net-pci,netdev=n0 \ + -serial "file:$EXPORT_DIR/console.log" \ + -display none \ + -daemonize \ + -pidfile "$EXPORT_DIR/imported.pid" +echo " imported pid: $(cat "$EXPORT_DIR/imported.pid")" + +# === 8. Wait for SSH === +SSH_OPTS="-i $BUNDLE_DIR/$NAME-ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh" +ssh_up=0 +for i in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS -p "$IMP_SSH_PORT" ystack@127.0.0.1 'true' 2>/dev/null; then + ssh_up=1 + echo " ssh up after $i tries" + break + fi + sleep 5 +done +if [[ $ssh_up -eq 0 ]]; then + echo "imported instance ssh never came up; console log:" >&2 + tail -50 "$EXPORT_DIR/console.log" >&2 + exit 1 +fi + +# === 9. Imported smoketest === +# Both endpoints must come back: echo (stateless) proves the +# Envoy Gateway data plane is up, /s3/health (StatefulSet against +# the local-path PV that lives on the appliance disk) proves the +# stateful workload survived the export -> bundle -> raw-qemu boot. 
+stage "imported-side smoketest: echo + s3" +imp_probe() { + local what=$1 url=$2 attempts=${3:-60} + local out + out=$(mktemp) + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o "$out" -w " $what HTTP %{http_code}\n" "$url"; then + echo + echo "=== imported $what response (head) ===" + head -25 "$out" + echo + rm -f "$out" + return 0 + fi + echo " $what attempt $i/$attempts: no answer yet" + sleep 5 + done + rm -f "$out" + return 1 +} +if imp_probe echo "http://127.0.0.1:$IMP_HTTP_PORT/q/envoy/echo" \ + && imp_probe s3 "http://127.0.0.1:$IMP_HTTP_PORT/s3/health"; then + echo "=== success: round-trip works (echo + s3) ===" + echo " imported echo reachable at: http://127.0.0.1:$IMP_HTTP_PORT/q/envoy/echo" + echo " imported s3 reachable at: http://127.0.0.1:$IMP_HTTP_PORT/s3/health" + echo " imported ssh: ssh -p $IMP_SSH_PORT -i $BUNDLE_DIR/$NAME-ssh ystack@127.0.0.1" + echo " build-side cluster preserved (KEEP_BUILD=1) -- destroy with: $Y_CLUSTER teardown -c $CFG_DIR" + if [[ -z "${KEEP_BUILD:-}" ]]; then + "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null # y-script-lint:disable=or-true # success path cleanup + fi + exit 0 +fi + +# === Diagnosis on failure === +echo >&2 +echo "imported smoketest never returned 200. Diagnostics:" >&2 +# shellcheck disable=SC2086 +ssh $SSH_OPTS -p "$IMP_SSH_PORT" ystack@127.0.0.1 \ + 'echo ===nodes===; sudo k3s kubectl get nodes -o wide; + echo ===pods===; sudo k3s kubectl get pods -A; + echo ===k3s status===; systemctl is-active k3s; + echo ===listen===; sudo ss -tlnp | grep -E ":(80|443|6443)\b" + ' >&2 # y-script-lint:disable=or-true # diagnostic best-effort +echo " imported ssh: ssh -p $IMP_SSH_PORT -i $BUNDLE_DIR/$NAME-ssh ystack@127.0.0.1" >&2 +echo " console log: $EXPORT_DIR/console.log" >&2 +exit 1 diff --git a/scripts/e2e-appliance-hetzner.pkr.hcl b/scripts/e2e-appliance-hetzner.pkr.hcl new file mode 100644 index 0000000..d65181e --- /dev/null +++ b/scripts/e2e-appliance-hetzner.pkr.hcl @@ -0,0 +1,252 @@ +# Packer template that bakes a y-cluster appliance directly on +# Hetzner Cloud and saves it as a snapshot. Replaces the older +# dd-via-rescue path (qemu-img convert + zstd + dd /dev/sda from +# the rescue image) which broke at the "TCP/22 reachable, no SSH +# banner" stage we couldn't diagnose without a console. +# +# Why Packer + hcloud builder: +# - Hetzner's supported custom-image path is snapshots, not +# uploaded raw images. Building on Hetzner avoids the BIOS / +# partition table / network-driver mismatch you hit when you +# dd a qemu disk onto bare metal. +# - Packer's hcloud builder owns the lifecycle: spin a temporary +# server from a stock Ubuntu image, run provisioners over SSH, +# power off, snapshot, delete the temporary server. +# - The output (snapshot ID + name) feeds straight into +# `hcloud server create --image=` for fleet rollout. +# +# Local appliance vs Hetzner appliance: +# - Local dev still uses `y-cluster provision` against qemu and +# prepare-export when the operator wants a portable qcow2. +# - Production / customer Hetzner deploys go through this Packer +# template instead. +# - Both share the workload manifests (pkg/echo/template.yaml and +# the upstream Envoy Gateway install) by re-running the same +# `y-cluster echo deploy` invocation; only the VM lifecycle +# diverges. +# +# Required: HCLOUD_TOKEN in env, var.y_cluster_binary set to a +# linux/amd64 y-cluster build. The orchestrator script +# (e2e-appliance-hetzner.sh) supplies both. 
+ +packer { + required_plugins { + hcloud = { + source = "github.com/hetznercloud/hcloud" + version = ">= 1.6" + } + } +} + +variable "hcloud_token" { + type = string + default = "${env("HCLOUD_TOKEN")}" + sensitive = true +} + +variable "snapshot_name" { + type = string + default = "y-cluster-appliance-{{timestamp}}" +} + +# cx23 = 2 vCPU / 4 GB RAM / 40 GB disk in hel1, ~€0.006/h. +# Hetzner retired cx22 / cpx21 in EU regions during 2026; the +# x86 shared lineup is now cx*3 / cpx*2 and cax* (Ampere arm). +variable "server_type" { + type = string + default = "cx23" +} + +variable "location" { + type = string + default = "hel1" +} + +variable "base_image" { + type = string + default = "ubuntu-24.04" +} + +variable "k3s_version" { + type = string + default = "v1.35.4+k3s1" +} + +# Tracks pkg/provision/envoygateway/version.go's Version constant. +# Kept independent here so `packer build` can be run against an +# older binary if needed; the orchestrator script does NOT pin +# them together to keep that flexibility. +variable "envoy_gateway_version" { + type = string + default = "v1.7.2" +} + +variable "y_cluster_binary" { + type = string + description = "Path to a linux/amd64 y-cluster binary to upload onto the build host" +} + +variable "prepare_script" { + type = string + description = "Path to pkg/provision/qemu/prepare_inguest.sh -- the shared identity-reset script that also runs against offline qcow2 disks via virt-customize" +} + +# Stable k3s node-name baked into the appliance. The build host's +# hostname is whatever Packer assigns (e.g. packer-XXXXXXXX); the +# customer's cloned server will end up with a different hostname +# (Hetzner sets it from the server name on first boot). Pinning +# K3S_NODE_NAME decouples k3s identity from the OS hostname, so +# the cloned server's k3s recognises the node entry baked into +# the snapshot's sqlite datastore. Without this pin, every cloned +# server registers a NEW node under its own hostname while the +# build-host node lingers as orphan, and every workload pod stays +# bound to the dead node. +variable "k3s_node_name" { + type = string + default = "appliance" +} + +variable "stateful_manifest" { + type = string + description = "Path to a pre-rendered single-file YAML for the appliance-stateful workload. Packer's file provisioner doesn't recursively upload directories cleanly across all builders, so the orchestrator script `kubectl kustomize`s testdata/appliance-stateful/base into a temp file and passes the path here." +} + +variable "localstorage_manifest" { + type = string + description = "Path to a pre-rendered local-path-provisioner manifest (output of `y-cluster localstorage render`). Same shape as stateful_manifest -- a host-rendered single yaml, applied via kubectl on the build VM." +} + +source "hcloud" "appliance" { + token = var.hcloud_token + image = var.base_image + location = var.location + server_type = var.server_type + ssh_username = "root" + snapshot_name = var.snapshot_name + snapshot_labels = { + purpose = "y-cluster-appliance" + } +} + +build { + sources = ["source.hcloud.appliance"] + + # Stage the y-cluster binary on the build host. Used here for + # `y-cluster echo deploy`; left on the appliance as a no-cost + # operator-inspection convenience. + provisioner "file" { + source = var.y_cluster_binary + destination = "/usr/local/bin/y-cluster" + } + + # Stage the shared identity-reset script. Same script runs on + # the qemu prepare-export path via virt-customize. 
Single + # source of truth for what the appliance disk looks like at + # snapshot time. + provisioner "file" { + source = var.prepare_script + destination = "/usr/local/bin/y-cluster-prepare" + } + + # Stage the stateful-workload manifest (VersityGW + # StatefulSet + Service + HTTPRoute + 1Gi local-path PVC). + # The file is a single rendered YAML produced by the + # orchestrator's `kubectl kustomize`, so this is a plain + # one-file scp -- no recursive directory upload, no Packer + # SSH-communicator quirks. + provisioner "file" { + source = var.stateful_manifest + destination = "/root/appliance-stateful.yaml" + } + + # Stage the bundled local-path-provisioner manifest + # (rendered by `y-cluster localstorage render` on the host). + # Replaces k3s's disabled local-storage addon with the + # appliance-shape defaults: path /data/yolean, predictable + # PVC namespace_name pattern, Retain reclaim. + provisioner "file" { + source = var.localstorage_manifest + destination = "/root/y-cluster-localstorage.yaml" + } + + # k3s install + workload + smoketest, all running normally. + # We run k3s during the build (no INSTALL_K3S_SKIP_START) so + # the snapshot includes a fully-converged cluster: kubeconfig, + # sqlite-resident workload state, pulled container images, + # everything. The cloned server's k3s recognises the node + # entry by K3S_NODE_NAME (baked in via /etc/systemd/system/ + # k3s.service.env) and resumes -- no orphan node, no first-boot + # manifests-dir reconcile loop, faster startup. + provisioner "shell" { + inline_shebang = "/bin/bash -eux" + environment_vars = [ + "K3S_VERSION=${var.k3s_version}", + "K3S_NODE_NAME=${var.k3s_node_name}", + "ENVOY_GATEWAY_VERSION=${var.envoy_gateway_version}", + "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", + ] + inline = [ + "cloud-init status --wait", + "chmod +x /usr/local/bin/y-cluster /usr/local/bin/y-cluster-prepare", + # Install + start. K3S_NODE_NAME comes from the + # environment_vars block above; the install script writes + # it into /etc/systemd/system/k3s.service.env so the + # cloned server's systemd-managed k3s reads it back on + # cold boot. + # --disable=local-storage: y-cluster ships its own + # local-path-provisioner via the y-cluster-localstorage.yaml + # applied below; k3s's bundled local-storage would otherwise + # reconcile our ConfigMap back to the upstream defaults. + "curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=\"$K3S_VERSION\" INSTALL_K3S_EXEC='--disable=traefik --disable=local-storage' sh -", + "until kubectl get nodes 2>/dev/null | grep -qE ' Ready '; do sleep 2; done", + # Bundled local-path-provisioner with appliance-shape + # defaults (path /data/yolean, predictable PVC + # namespace_name pattern, Retain reclaim). + "kubectl apply --server-side --field-manager=y-cluster -f /root/y-cluster-localstorage.yaml", + "kubectl --namespace=local-path-storage rollout status deployment/local-path-provisioner --timeout=120s", + # Envoy Gateway upstream install + the y-cluster GatewayClass. 
+ "kubectl apply --server-side -f https://github.com/envoyproxy/gateway/releases/download/$ENVOY_GATEWAY_VERSION/install.yaml", + "kubectl wait --namespace=envoy-gateway-system --for=condition=Available deployments --all --timeout=180s", + "kubectl apply --server-side -f - <<'EOF'\napiVersion: gateway.networking.k8s.io/v1\nkind: GatewayClass\nmetadata:\n name: y-cluster\nspec:\n controllerName: gateway.envoyproxy.io/gatewayclass-controller\nEOF", + # Echo workload via the standard kubectl path -- y-cluster + # has no special case for the customer's app. + "/usr/local/bin/y-cluster echo deploy --context default", + "kubectl --namespace=y-cluster wait --for=condition=Available deployment/echo --timeout=120s", + # Stateful workload: VersityGW (S3-over-posix gateway) + # backed by a local-path PVC. Brings up the persistent- + # volume code path so the snapshot includes a + # provisioned PV directory under /var/lib/rancher/k3s/ + # storage, with the StatefulSet bound to it. Cloned + # servers' k3s recognises the same node-name (appliance) + # and rebinds the same PV directory -- no orphan, no + # re-provision. + "kubectl apply --server-side --field-manager=appliance-build -f /root/appliance-stateful.yaml", + "kubectl --namespace=appliance-stateful rollout status statefulset/versitygw --timeout=180s", + # In-VM smoketest: klipper-lb (k3s's bundled LoadBalancer + # controller) binds host port 80 on the node. Probe both + # the echo path and the s3 path so a build with a broken + # PVC, missing storage class, or mis-routed HTTPRoute + # fails at build time. + "for i in $(seq 1 60); do curl -fsS http://localhost/q/envoy/echo && break; sleep 2; done", + "for i in $(seq 1 60); do curl -fsS http://localhost/s3/health && break; sleep 2; done", + ] + } + + # Identity reset via the shared script. Runs in the live VM + # against /etc/cloud/cloud.cfg.d/, /etc/netplan/, log files, + # bash history, etc. Same script the qemu prepare-export + # runs offline; one source of truth. + # + # After the script, stop k3s gracefully so the snapshot + # captures a quiesced sqlite datastore. Packer's hcloud + # builder powers the VM off and snapshots after this + # provisioner returns. + provisioner "shell" { + inline_shebang = "/bin/bash -eux" + inline = [ + "/usr/local/bin/y-cluster-prepare", + "systemctl stop k3s", + "sync", + ] + } +} diff --git a/scripts/e2e-appliance-hetzner.sh b/scripts/e2e-appliance-hetzner.sh new file mode 100755 index 0000000..9fbc137 --- /dev/null +++ b/scripts/e2e-appliance-hetzner.sh @@ -0,0 +1,306 @@ +#!/usr/bin/env bash +# e2e: build a y-cluster appliance snapshot on Hetzner Cloud via +# Packer, boot a server from it, and verify the echo HTTPRoute +# answers over the public IP. +# +# Replaces the older dd-via-rescue path (qemu-img convert + zstd + +# dd /dev/sda from rescue mode) which broke at the "TCP/22 reachable, +# no SSH banner" stage we couldn't diagnose without out-of-band +# console. Packer's hcloud builder handles base-image / partition +# layout / network drivers natively, so the path "image boots on +# Hetzner" is no longer something we have to engineer ourselves -- +# we get it for free by building on Hetzner from the start. +# +# Local appliance vs Hetzner appliance: +# - Local dev still uses `y-cluster provision` against qemu and +# prepare-export when the operator wants a portable qcow2. +# - Production Hetzner deploys go through this script, which +# produces a reusable snapshot a fleet can clone from. +# +# Stages: +# 1. 
Build a current-arch y-cluster dev binary into ./dist (the +# Packer template uploads it onto the build host). +# 2. `packer init` + `packer build` of e2e-appliance-hetzner.pkr.hcl. +# Packer creates a temporary cx23 in hel1, runs the workload +# install, snapshots, and tears the temporary server down. +# 3. Resolve the snapshot ID from `hcloud image list`. +# 4. Create a fresh server from the snapshot (idempotent: deletes +# any matching $SERVER_NAME first). +# 5. Probe http:///q/envoy/echo until it answers. +# +# Prerequisites: +# - HCLOUD_TOKEN sourced from $ENV_FILE (set in .env or shell env) +# - hcloud CLI on PATH (apt install hcloud OR snap install hcloud) +# - packer on PATH (apt install packer after adding HashiCorp's +# repo, OR download from releases.hashicorp.com) +# - go (to build the dev binary), curl, ssh-keygen +# +# On success: prints the public IP and leaves the server running so +# the operator can poke at it. Teardown is manual: +# hcloud server delete $SERVER_NAME +# hcloud image delete # optional: snapshot is reusable +# The script is idempotent on re-run -- it deletes any matching +# server/key first and starts fresh from a new snapshot. + +[ -z "$DEBUG" ] || set -x +set -eo pipefail + +YHELP='e2e-appliance-hetzner.sh - Build a y-cluster appliance snapshot on Hetzner Cloud and verify it serves traffic + +Usage: e2e-appliance-hetzner.sh + +Environment: + HCLOUD_TOKEN Hetzner Cloud API token (sourced from ENV_FILE) + ENV_FILE Path to env file with HCLOUD_TOKEN (set in .env or shell env; required) + SERVER_NAME Server name to create (default: y-cluster-appliance-test) + SERVER_TYPE Hetzner server type (default: cx23) + SERVER_LOCATION Hetzner location (default: hel1) + SNAPSHOT_NAME Snapshot description used as Packer output name + Y_CLUSTER Path to dev binary (default: ./dist/y-cluster) + DEBUG Set non-empty to enable bash trace + +Dependencies: + packer, hcloud, go, ssh, ssh-keygen, curl + +Exit codes: + 0 Success: appliance reachable on public IP + 1 Missing prereq, packer build failure, or echo never answered +' + +case "${1:-}" in + help) echo "$YHELP"; exit 0 ;; + --help) echo "$YHELP"; exit 0 ;; + -h) echo "$YHELP"; exit 0 ;; +esac + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +if [[ -f "$REPO_ROOT/.env" ]]; then + set -o allexport; . "$REPO_ROOT/.env"; set +o allexport +fi + +: "${ENV_FILE:?set ENV_FILE in .env or shell env}" + +if [[ ! -f "$ENV_FILE" ]]; then + echo "missing env file: $ENV_FILE" >&2 + echo "expected at minimum: HCLOUD_TOKEN=" >&2 + exit 1 +fi +# shellcheck disable=SC1090 +source "$ENV_FILE" +[[ -n "${HCLOUD_TOKEN:-}" ]] || { echo "HCLOUD_TOKEN not set in $ENV_FILE" >&2; exit 1; } +export HCLOUD_TOKEN + +# Tunables. Defaults match the Packer template's; override here when +# experimenting with alternate locations / instance types. 
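+# e.g. (values are illustrative; any Hetzner location / x86 server
+# type your project has quota for works):
+#   SERVER_LOCATION=fsn1 SNAPSHOT_NAME=packer-smoke scripts/e2e-appliance-hetzner.sh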
+SERVER_NAME="${SERVER_NAME:-y-cluster-appliance-test}" +SERVER_TYPE="${SERVER_TYPE:-cx23}" +SERVER_LOCATION="${SERVER_LOCATION:-hel1}" +SNAPSHOT_NAME="${SNAPSHOT_NAME:-y-cluster-appliance-$(date -u +%Y%m%d-%H%M%S)}" + +Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}" +PACKER_TEMPLATE="$REPO_ROOT/scripts/e2e-appliance-hetzner.pkr.hcl" + +for tool in packer hcloud go ssh ssh-keygen curl; do + command -v "$tool" >/dev/null \ + || { echo "missing required tool: $tool" >&2; exit 1; } +done + +stage() { printf '\n=== %s ===\n' "$*"; } + +confirm() { + local prompt=$1 + if [[ -n "${ASSUME_YES:-}" ]]; then + echo "ASSUME_YES set; proceeding ($prompt)" + return 0 + fi + read -r -p "$prompt [y/N] " answer + case "${answer,,}" in + y|yes) return 0 ;; + *) return 1 ;; + esac +} + +cat <<'WARN' + +================================================================ +DEPRECATION WARNING + +scripts/e2e-appliance-hetzner.sh's role is changing. + +Today this script tests the legacy "Hetzner-as-export-mode" +shape: build an appliance inside a Hetzner VM via Packer, +snapshot, boot a server from the snapshot. This shape doesn't +match the appliance contract (Hetzner Cloud has no public API +for uploading a locally-built disk). + +Replacement plan: + - This script will be REPURPOSED once a Hetzner PROVISIONER + exists in pkg/provision/hetzner/ (alongside qemu / docker / + multipass). Repurposed scope: end-to-end test of + `y-cluster provision -c hetzner.yaml` -> snapshot -> + instantiate-from-snapshot. The Packer-build half goes away; + the snapshot becomes a regular y-cluster lifecycle artefact. + - The local-build appliance contract is moving to + scripts/appliance-qemu-to-gcp.sh (Hetzner's API can't + accept a local disk; GCP's `gcloud compute images import` + can). + +This script still runs. It still passes. But its purpose is +about to flip; treat results from a green run today as +"Packer build still works" rather than "appliance contract +verified". +================================================================ + +WARN +confirm "Proceed with the legacy Packer e2e anyway?" \ + || { echo "aborted; no changes made."; exit 0; } + +# === 1. Build the dev binary the Packer template uploads === +stage "building linux/amd64 dev binary -> $Y_CLUSTER" +mkdir -p "$(dirname "$Y_CLUSTER")" +( cd "$REPO_ROOT" && GOOS=linux GOARCH=amd64 go build -o "$Y_CLUSTER" ./cmd/y-cluster ) + +# === 2. render stateful manifest + packer init + build === +# Packer's file provisioner doesn't recursively upload +# directories cleanly across all builder/communicator +# combinations (hcloud's SSH communicator scp's a directory +# argument as a single path and gets back "Is a directory"). +# Pre-render the kustomize base on the host into one yaml file +# and ship that single file to the build VM instead. Same end +# result, no scp recursion concerns. +# The fixture is split into two yconverge modules (namespace +# first, then the StatefulSet+Service+HTTPRoute) so the local +# convergence path can express the dep with a cue import. The +# Hetzner Packer flow doesn't run yconverge inside the build +# VM (would need the y-cluster binary on the VM) -- it stays +# kubectl-apply, but we render BOTH bases and concat. kubectl +# applies a Namespace ahead of namespaced resources in the +# same -f input, so a single concat'd file converges in the +# right order. 
+STATEFUL_MANIFEST=$(mktemp -t appliance-stateful.XXXXXX.yaml) +{ + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/namespace" + echo '---' + kubectl kustomize "$REPO_ROOT/testdata/appliance-stateful/base" +} > "$STATEFUL_MANIFEST" + +# y-cluster's bundled local-path-provisioner manifest (replaces +# k3s's disabled local-storage). Rendered with the same defaults +# the Go-side provisioners install so an appliance built via +# Hetzner Packer ends up indistinguishable from one built locally. +LOCALSTORAGE_MANIFEST=$(mktemp -t y-cluster-localstorage.XXXXXX.yaml) +"$Y_CLUSTER" localstorage render > "$LOCALSTORAGE_MANIFEST" + +stage "packer init" +packer init "$PACKER_TEMPLATE" + +stage "packer build (creates a temporary $SERVER_TYPE in $SERVER_LOCATION, snapshots, deletes)" +packer build \ + -var "snapshot_name=$SNAPSHOT_NAME" \ + -var "server_type=$SERVER_TYPE" \ + -var "location=$SERVER_LOCATION" \ + -var "y_cluster_binary=$Y_CLUSTER" \ + -var "prepare_script=$REPO_ROOT/pkg/provision/qemu/prepare_inguest.sh" \ + -var "stateful_manifest=$STATEFUL_MANIFEST" \ + -var "localstorage_manifest=$LOCALSTORAGE_MANIFEST" \ + "$PACKER_TEMPLATE" + +# === 3. Resolve snapshot ID === +# Packer's hcloud builder prints the snapshot ID at the end of build +# but doesn't expose it in a stable machine-readable way without a +# manifest post-processor. hcloud image list is the simpler path. +stage "resolving snapshot id for $SNAPSHOT_NAME" +SNAPSHOT_ID=$(hcloud image list \ + --type=snapshot \ + --selector="purpose=y-cluster-appliance" \ + --output=json \ + | python3 -c " +import json, sys +images = json.load(sys.stdin) +matches = [i for i in images if i.get('description') == '$SNAPSHOT_NAME'] +if not matches: + sys.exit('no snapshot named $SNAPSHOT_NAME found') +print(matches[0]['id']) +") +echo " snapshot id: $SNAPSHOT_ID" + +# === 4. Create a fresh ssh keypair + server from the snapshot === +KEY_DIR=$(mktemp -d) +trap 'rm -rf "$KEY_DIR" "$STATEFUL_MANIFEST" "$LOCALSTORAGE_MANIFEST"' EXIT +ssh-keygen -t ed25519 -N '' -C "$SERVER_NAME-$$" -f "$KEY_DIR/id" -q +KEY_NAME="$SERVER_NAME" + +stage "tearing down any leftover server / key from a prior run" +hcloud server delete "$SERVER_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing server is not an error +hcloud ssh-key delete "$KEY_NAME" 2>/dev/null || true # y-script-lint:disable=or-true # idempotent cleanup: missing key is not an error + +stage "registering ssh public key as $KEY_NAME" +hcloud ssh-key create --name "$KEY_NAME" --public-key-from-file "$KEY_DIR/id.pub" >/dev/null + +stage "creating $SERVER_NAME from snapshot $SNAPSHOT_ID" +hcloud server create \ + --name "$SERVER_NAME" \ + --type "$SERVER_TYPE" \ + --image "$SNAPSHOT_ID" \ + --location "$SERVER_LOCATION" \ + --ssh-key "$KEY_NAME" \ + >/dev/null +PUBLIC_IP=$(hcloud server ip "$SERVER_NAME") +echo " public ip: $PUBLIC_IP" + +# === 5. Wait for sshd, then probe the echo HTTPRoute === +SSH_OPTS="-i $KEY_DIR/id -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5" +echo " waiting for ssh on $PUBLIC_IP:22" +for _ in $(seq 1 60); do + # shellcheck disable=SC2086 + if ssh $SSH_OPTS root@"$PUBLIC_IP" 'true' 2>/dev/null; then + break + fi + sleep 5 +done + +# Cold boot from snapshot: cloud-init runs (~30s), k3s.service starts +# for the first time, the envoy gateway controller comes up, the +# envoy proxy data plane comes up, the VersityGW StatefulSet +# rebinds its PV, klipper-lb binds :80. 
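+# (If you want to watch that chain converge by hand while the probes
+# below run, the same ssh invocation the failure path uses works here
+# too -- e.g. ssh $SSH_OPTS root@$PUBLIC_IP 'kubectl get pods -A' --
+# purely optional; the probe loop is the authoritative check.)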
+# The probe loop is long enough to cover the whole chain on a fresh cx23.
+PROBE_ATTEMPTS=60
+probe() {
+  local what=$1 url=$2 attempts=${3:-$PROBE_ATTEMPTS}
+  local out
+  out=$(mktemp)
+  for i in $(seq 1 "$attempts"); do
+    if curl -fsS --max-time 8 -o "$out" -w "  $what HTTP %{http_code}\n" "$url"; then
+      echo
+      echo "=== $what response (head) ==="
+      head -25 "$out"
+      echo
+      rm -f "$out"
+      return 0
+    fi
+    echo "  $what attempt $i/$attempts: no answer yet"
+    sleep 10
+  done
+  rm -f "$out"
+  return 1
+}
+
+stage "probing http://$PUBLIC_IP -- echo + s3"
+if probe echo "http://$PUBLIC_IP/q/envoy/echo" \
+  && probe s3 "http://$PUBLIC_IP/s3/health"; then
+  echo "=== success: cloned server serves echo + s3 ==="
+  echo "  echo:    http://$PUBLIC_IP/q/envoy/echo"
+  echo "  s3:      http://$PUBLIC_IP/s3/health"
+  echo "  ssh:     ssh -i $KEY_DIR/id root@$PUBLIC_IP"
+  echo "  destroy: hcloud server delete $SERVER_NAME"
+  echo "  snapshot ($SNAPSHOT_ID) preserved -- reuse with: hcloud server create --image=$SNAPSHOT_ID ..."
+  exit 0
+fi
+
+echo >&2
+echo "echo/s3 never answered within $((PROBE_ATTEMPTS * 10))s. server still up for diagnosis:" >&2
+# shellcheck disable=SC2086
+ssh $SSH_OPTS root@"$PUBLIC_IP" 'systemctl is-active k3s; kubectl get pods -A 2>&1 | head -30' >&2 \
+  || true # y-script-lint:disable=or-true # diagnostic best-effort -- main failure already exits 1
+echo "  ssh:     ssh -i $KEY_DIR/id root@$PUBLIC_IP" >&2
+echo "  destroy: hcloud server delete $SERVER_NAME" >&2
+exit 1
diff --git a/scripts/e2e-appliance-qemu-to-gcp.sh b/scripts/e2e-appliance-qemu-to-gcp.sh
new file mode 100755
index 0000000..073d933
--- /dev/null
+++ b/scripts/e2e-appliance-qemu-to-gcp.sh
@@ -0,0 +1,392 @@
+#!/usr/bin/env bash
+# e2e: complete qemu-to-GCP appliance workflow, non-interactive.
+#
+# This is the canonical SRE example for the appliance contract:
+# the disk we verify locally with qemu IS the disk that boots in
+# Google Compute Engine. No re-build on the cloud side; the GCS
+# tarball is exactly what `y-cluster export --format=gcp-tar`
+# produced from the local provision.
+#
+# The workflow this script documents -- in order -- is what an
+# SRE follows by hand when they want to ship a customer
+# appliance to GCP:
+#
+# 1. Bootstrap a GCP service account in the QA project (one
+#    time per project; output is a JSON key the rest of the
+#    flow consumes via GOOGLE_APPLICATION_CREDENTIALS).
+#      scripts/gcp-bootstrap-credentials.sh
+#
+# 2. Provision a y-cluster appliance locally on qemu. This
+#    gives the same k3s + Envoy Gateway + bundled local-path
+#    stack the customer will run.
+#      y-cluster provision -c <config-dir>
+#
+# 3. Install the customer's workload(s). The e2e here uses
+#    the y-cluster echo workload + the appliance-stateful
+#    VersityGW StatefulSet as stand-ins; in real customer
+#    flows this is whatever kubectl apply / yconverge / helm
+#    the customer specifies. The Hetzner Object Storage
+#    tutorial uses VersityGW; the principle is the same.
+#      y-cluster echo render | kubectl apply -f -
+#      y-cluster yconverge -k testdata/appliance-stateful/base
+#
+# 4. Smoketest from the host. Anything that's reachable on
+#    :80 of the local qemu's port-forward is reachable on
+#    :80 of the eventual GCE VM.
+#      curl http://127.0.0.1:80/q/envoy/echo
+#
+# 5. Stop the cluster cleanly so the qcow2 is quiesced. The
+#    graceful-stop logic flushes containerd snapshot state.
+#      y-cluster stop --context=$NAME
+#
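+#    (Extra belt-and-braces if you want it: once the VM is stopped,
+#     `qemu-img check` against the cluster's qcow2 -- it lives somewhere
+#     under CACHE_DIR, the exact filename is y-cluster's business --
+#     should report no errors. Illustrative only; the script itself
+#     just trusts the graceful stop.)
+#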
+# 6. prepare-export: virt-customize-driven identity reset
+#    (machine-id retained, ssh host keys retained, cloud-init
+#    cleaned, netplan generic-NIC match installed,
+#    systemd-timesyncd enabled). This is the step that makes
+#    the disk portable.
+#      y-cluster prepare-export --context=$NAME
+#
+# 7. Export to GCE custom-image format. Produces
+#    <bundle-dir>/<name>.tar.gz containing exactly disk.raw.
+#      y-cluster export --context=$NAME --format=gcp-tar <bundle-dir>
+#
+# 8. Upload to GCS. Bucket created on first run with
+#    uniform-access mode.
+#      gcloud storage cp <bundle-dir>/<name>.tar.gz \
+#        gs://<project>-appliance-images/<image-name>.tar.gz
+#
+# 9. Create custom image from the GCS object. Direct create
+#    (no managed conversion job).
+#      gcloud compute images create <image-name> \
+#        --source-uri=gs://<project>-appliance-images/<image-name>.tar.gz
+#
+# 10. Ensure firewall opens public ports. Idempotent.
+#      gcloud compute firewall-rules create y-cluster-appliance-public ...
+#
+# 11. Create VM from the image, tagged for the firewall rule.
+#      gcloud compute instances create <vm-name> \
+#        --image=<image-name> --tags=y-cluster-appliance ...
+#
+# 12. Wait for ssh + probe HTTP. The disk we just built is the
+#     disk now booting; if smoketest passes here, it's the same
+#     smoketest that passed locally.
+#
+# 13. Teardown: delete the VM, the image, the GCS object, the
+#     local cluster. The e2e is the thing that proves the
+#     contract; we don't leave artefacts behind.
+#
+# Re-run safety: every step is idempotent. Running this twice
+# in a row produces the same result; partial-failure re-runs
+# pick up where the previous left off (fresh teardown of any
+# leftover server / image / cluster on entry).
+#
+# This script is the proof. The interactive variant is
+# scripts/appliance-qemu-to-gcp.sh -- same flow but with
+# operator prompts at the export and GCP-write boundaries.
+
+[ -z "$DEBUG" ] || set -x
+set -eo pipefail
+
+YHELP='e2e-appliance-qemu-to-gcp.sh - canonical SRE workflow: provision -> install -> verify -> prepare-export -> export gcp-tar -> upload -> image -> instance -> probe -> teardown
+
+Usage: e2e-appliance-qemu-to-gcp.sh
+
+Environment:
+  GCP_PROJECT       GCP project (set in .env or shell env; required)
+  GCP_REGION        GCP region (default: europe-north2)
+  GCP_ZONE          GCP zone (default: europe-north2-a)
+  GCP_BUCKET        GCS bucket (default: <GCP_PROJECT>-appliance-images)
+  GCP_MACHINE_TYPE  Machine type (default: e2-medium)
+  GCP_KEY           Service account JSON (set in .env or shell env; required)
+  NAME              Cluster + VM name (default: appliance-gcp-e2e)
+  APP_HTTP_PORT     Override host port for guest 80 during the local qemu stage
+  APP_HTTPS_PORT    Override host port for guest 443 during the local qemu stage
+  APP_API_PORT      Override host port for guest 6443 during the local qemu stage
+  APP_SSH_PORT      Override host port for guest 22 during the local qemu stage
+  KEEP              Set to skip teardown for diagnosis (default: tear down on exit)
+  DEBUG             Set non-empty for bash trace
+
+Dependencies:
+  go, qemu-system-x86_64, qemu-img, kubectl, ssh, ssh-keygen, curl,
+  virt-sysprep, gcloud, jq
+'
+
+case "${1:-}" in
+  help) echo "$YHELP"; exit 0 ;;
+  --help) echo "$YHELP"; exit 0 ;;
+  -h) echo "$YHELP"; exit 0 ;;
+esac
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+if [[ -f "$REPO_ROOT/.env" ]]; then
+  set -o allexport; . "$REPO_ROOT/.env"; set +o allexport
+fi
+
+: "${GCP_PROJECT:?set GCP_PROJECT in .env or shell env}"
+: "${GCP_KEY:?set GCP_KEY in .env or shell env}"
+
+GCP_REGION="${GCP_REGION:-europe-north2}"
+GCP_ZONE="${GCP_ZONE:-europe-north2-a}"
+GCP_BUCKET="${GCP_BUCKET:-${GCP_PROJECT}-appliance-images}"
+GCP_MACHINE_TYPE="${GCP_MACHINE_TYPE:-e2-medium}"
+
+NAME="${NAME:-appliance-gcp-e2e}"
+
+Y_CLUSTER="${Y_CLUSTER:-$REPO_ROOT/dist/y-cluster}"
+CACHE_DIR="${CACHE_DIR:-$HOME/.cache/y-cluster-qemu}"
+CFG_DIR="$HOME/.cache/y-cluster-appliance-build/$NAME"
+BUNDLE_DIR=$(mktemp -d -p "$REPO_ROOT/dist" "appliance-gcp-e2e.XXXXXX" 2>/dev/null \
+  || mktemp -d -p /tmp "appliance-gcp-e2e.XXXXXX")
+
+IMAGE_NAME="$NAME-$(date -u +%Y%m%d-%H%M%S)"
+VM_NAME="$NAME"
+
+stage() { printf '\n=== %s ===\n' "$*"; }
+
+teardown() {
+  set +e
+  if [[ -n "${KEEP:-}" ]]; then
+    echo
+    echo "KEEP set; preserving artefacts for diagnosis:"
+    echo "  local cluster: $Y_CLUSTER teardown -c $CFG_DIR"
+    echo "  GCE VM:        gcloud compute instances delete $VM_NAME --project=$GCP_PROJECT --zone=$GCP_ZONE"
+    echo "  GCE image:     gcloud compute images delete $IMAGE_NAME --project=$GCP_PROJECT"
+    echo "  GCS object:    gcloud storage rm gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz --project=$GCP_PROJECT"
+    echo "  bundle:        $BUNDLE_DIR"
+    return
+  fi
+  stage "teardown"
+  gcloud compute instances delete "$VM_NAME" \
+    --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing VM is not an error
+  gcloud compute images delete "$IMAGE_NAME" \
+    --project="$GCP_PROJECT" --quiet 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing image is not an error
+  gcloud storage rm "gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" \
+    --project="$GCP_PROJECT" 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing object is not an error
+  "$Y_CLUSTER" teardown -c "$CFG_DIR" 2>/dev/null # y-script-lint:disable=or-true # idempotent cleanup: missing cluster is not an error
+  rm -rf "$BUNDLE_DIR"
+}
+trap teardown EXIT
+
+for tool in go qemu-system-x86_64 qemu-img kubectl ssh ssh-keygen curl virt-sysprep gcloud jq; do
+  command -v "$tool" >/dev/null \
+    || { echo "missing required tool: $tool" >&2; exit 1; }
+done
+
+if [[ ! -f "$GCP_KEY" ]]; then
+  echo "missing GCP key: $GCP_KEY" >&2
+  echo "create it with: scripts/gcp-bootstrap-credentials.sh" >&2
+  exit 1
+fi
+# Same schema check as the interactive flow -- mismatch on a
+# truncated / wrong-shape JSON should fail fast here rather
+# than two gcloud calls in.
+if ! jq -e 'all(.type == "service_account"; .) and (.project_id // empty | length > 0) and (.client_email // empty | length > 0) and (.private_key // empty | length > 0)' "$GCP_KEY" >/dev/null 2>&1; then
+  echo "GCP key at $GCP_KEY is missing required fields" >&2
+  echo "  expected JSON with: type=service_account, project_id, client_email, private_key" >&2
+  exit 1
+fi
+export GOOGLE_APPLICATION_CREDENTIALS="$GCP_KEY"
+
+# Acknowledge parallel composite uploads up front (silences
+# the WARNING stanza gcloud would otherwise emit on every
+# `storage cp` for files >150 MiB).
+export CLOUDSDK_STORAGE_PARALLEL_COMPOSITE_UPLOAD_ENABLED=True
+
+# prepare-export drives virt-sysprep / virt-customize (libguestfs), which
+# needs a host kernel image it can read; on stock Ubuntu /boot/vmlinuz-*
+# is readable by root only.
+if ! [ -r /boot/vmlinuz-"$(uname -r)" ]; then
+  cat >&2 <<EOF
+/boot/vmlinuz-$(uname -r) is not readable by your user.
+virt-sysprep (and therefore prepare-export) needs a readable kernel image.
+One-off fix: sudo chmod 0644 /boot/vmlinuz-*
+EOF
+  exit 1
+fi
+
+# === 1. Build dev binary ===
+stage "building dev binary -> $Y_CLUSTER"
+mkdir -p "$(dirname "$Y_CLUSTER")"
+( cd "$REPO_ROOT" && go build -o "$Y_CLUSTER" ./cmd/y-cluster )
+
+# === 2.
Provision local qemu === +mkdir -p "$CFG_DIR" +# YAML emission omits any port the operator didn't override, letting +# y-cluster's Go binary apply its own defaults (sshPort=2222, +# portForwards={6443:6443, 80:80, 443:443}). +{ + echo "provider: qemu" + echo "name: $NAME" + echo "context: $NAME" + [ -n "${APP_SSH_PORT:-}" ] && printf 'sshPort: "%s"\n' "$APP_SSH_PORT" + echo 'memory: "4096"' + echo 'cpus: "2"' + echo 'diskSize: "40G"' + if [ -n "${APP_HTTP_PORT:-}" ] || [ -n "${APP_HTTPS_PORT:-}" ] || [ -n "${APP_API_PORT:-}" ]; then + echo "portForwards:" + [ -n "${APP_API_PORT:-}" ] && printf ' - host: "%s"\n guest: "6443"\n' "$APP_API_PORT" + [ -n "${APP_HTTP_PORT:-}" ] && printf ' - host: "%s"\n guest: "80"\n' "$APP_HTTP_PORT" + [ -n "${APP_HTTPS_PORT:-}" ] && printf ' - host: "%s"\n guest: "443"\n' "$APP_HTTPS_PORT" + fi +} > "$CFG_DIR/y-cluster-provision.yaml" + +stage "tearing down any leftover $NAME cluster" +"$Y_CLUSTER" teardown -c "$CFG_DIR" || true # y-script-lint:disable=or-true # idempotent re-entry: missing cluster is not an error + +stage "provisioning $NAME (k3s + Envoy Gateway)" +"$Y_CLUSTER" provision -c "$CFG_DIR" + +# === 3. Install canonical workloads === +stage "installing echo workload" +"$Y_CLUSTER" echo render \ + | kubectl --context="$NAME" apply --server-side --field-manager=customer-install -f - +kubectl --context="$NAME" -n y-cluster wait \ + --for=condition=Available deployment/echo --timeout=180s + +stage "installing VersityGW StatefulSet via yconverge" +"$Y_CLUSTER" yconverge --context="$NAME" \ + -k "$REPO_ROOT/testdata/appliance-stateful/base" + +# === 4. Local smoketest === +stage "local smoketest: echo + s3" +probe_local() { + local what=$1 url=$2 attempts=${3:-30} + for i in $(seq 1 "$attempts"); do + if curl -fsS --max-time 8 -o /dev/null -w " $what HTTP %{http_code}\n" "$url"; then + return 0 + fi + echo " $what attempt $i/$attempts" + sleep 5 + done + return 1 +} +probe_local echo "http://127.0.0.1:${APP_HTTP_PORT:-80}/q/envoy/echo" +probe_local s3 "http://127.0.0.1:${APP_HTTP_PORT:-80}/s3/health" + +# === 5. Stop === +stage "stopping cluster" +"$Y_CLUSTER" stop --context="$NAME" + +# === 6. prepare-export === +stage "prepare-export" +"$Y_CLUSTER" prepare-export --context="$NAME" + +# === 7. Export to GCE-tar === +stage "exporting GCE-custom-image tarball -> $BUNDLE_DIR" +# y-cluster export refuses non-empty bundle dirs; the mktemp -d +# above created an empty dir we own, so a fresh re-run is fine. +# On retry-after-failure paths the dir might have content from +# the previous attempt, so we wipe + let export recreate. +rm -rf "$BUNDLE_DIR" +"$Y_CLUSTER" export --context="$NAME" --format=gcp-tar "$BUNDLE_DIR" +TARBALL="$BUNDLE_DIR/$NAME.tar.gz" +echo " size: $(stat -c '%s' "$TARBALL" | numfmt --to=iec-i --suffix=B 2>/dev/null || stat -c '%s' "$TARBALL")" + +# === 8. Upload to GCS === +stage "ensuring bucket gs://$GCP_BUCKET ($GCP_REGION)" +if ! gcloud storage buckets describe "gs://$GCP_BUCKET" --project="$GCP_PROJECT" >/dev/null 2>&1; then + gcloud storage buckets create "gs://$GCP_BUCKET" \ + --project="$GCP_PROJECT" \ + --location="$GCP_REGION" \ + --uniform-bucket-level-access +fi + +stage "uploading -> gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" +gcloud storage cp "$TARBALL" "gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" --project="$GCP_PROJECT" + +# === 9. 
Create custom image ===
+stage "creating GCE custom image $IMAGE_NAME"
+gcloud compute images create "$IMAGE_NAME" \
+  --project="$GCP_PROJECT" \
+  --source-uri="gs://$GCP_BUCKET/$IMAGE_NAME.tar.gz" \
+  --family=y-cluster-appliance \
+  --architecture=X86_64 \
+  >/dev/null
+
+# === 10. Firewall (idempotent) ===
+FIREWALL_RULE="y-cluster-appliance-public"
+stage "ensuring firewall rule $FIREWALL_RULE"
+if ! gcloud compute firewall-rules describe "$FIREWALL_RULE" --project="$GCP_PROJECT" >/dev/null 2>&1; then
+  gcloud compute firewall-rules create "$FIREWALL_RULE" \
+    --project="$GCP_PROJECT" \
+    --direction=INGRESS \
+    --network=default \
+    --action=ALLOW \
+    --rules=tcp:80,tcp:443 \
+    --target-tags=y-cluster-appliance \
+    --source-ranges=0.0.0.0/0 \
+    >/dev/null
+fi
+
+# === 11. Create VM ===
+stage "creating $VM_NAME ($GCP_MACHINE_TYPE in $GCP_ZONE)"
+if gcloud compute instances describe "$VM_NAME" --project="$GCP_PROJECT" --zone="$GCP_ZONE" >/dev/null 2>&1; then
+  gcloud compute instances delete "$VM_NAME" \
+    --project="$GCP_PROJECT" --zone="$GCP_ZONE" --quiet >/dev/null
+fi
+# Boot disk must be at least as large as the exported image's virtual
+# disk (the local provision uses 40G), or GCE rejects the create.
+gcloud compute instances create "$VM_NAME" \
+  --project="$GCP_PROJECT" \
+  --zone="$GCP_ZONE" \
+  --machine-type="$GCP_MACHINE_TYPE" \
+  --image="$IMAGE_NAME" \
+  --image-project="$GCP_PROJECT" \
+  --boot-disk-size=40GB \
+  --tags=y-cluster-appliance \
+  >/dev/null
+PUBLIC_IP=$(gcloud compute instances describe "$VM_NAME" \
+  --project="$GCP_PROJECT" \
+  --zone="$GCP_ZONE" \
+  --format='get(networkInterfaces[0].accessConfigs[0].natIP)')
+echo "  public ip: $PUBLIC_IP"
+
+# === 12. Wait for ssh + probe HTTP ===
+SSH_KEY="$CACHE_DIR/$NAME-ssh"
+SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5"
+
+echo "  waiting for ssh on $PUBLIC_IP:22 (cloud-init can take 30-90s on first boot)"
+ssh_up=0
+for i in $(seq 1 60); do
+  # shellcheck disable=SC2086
+  if ssh $SSH_OPTS ystack@"$PUBLIC_IP" 'true' 2>/dev/null; then
+    echo "  ssh up after $i attempt(s)"
+    ssh_up=1
+    break
+  fi
+  echo "  ssh attempt $i/60: not yet"
+  sleep 5
+done
+[[ $ssh_up -eq 1 ]] || { echo "ssh never came up on $PUBLIC_IP" >&2; exit 1; }
+
+probe_remote() {
+  local what=$1 url=$2 attempts=${3:-60}
+  for i in $(seq 1 "$attempts"); do
+    if curl -fsS --max-time 8 -o /dev/null -w "  $what HTTP %{http_code}\n" "$url"; then
+      return 0
+    fi
+    echo "  $what attempt $i/$attempts"
+    sleep 10
+  done
+  return 1
+}
+
+stage "probing http://$PUBLIC_IP -- echo + s3 (same routes the local cluster served)"
+if probe_remote echo "http://$PUBLIC_IP/q/envoy/echo" \
+  && probe_remote s3 "http://$PUBLIC_IP/s3/health"; then
+  echo
+  echo "================================================================"
+  echo "PASS: appliance-qemu-to-gcp e2e."
+  echo
+  echo "Local-built disk booted in GCP and served the same routes that"
+  echo "the local qemu served. The appliance contract holds."
+  echo
+  echo "  Public IP:  $PUBLIC_IP"
+  echo "  SSH:        ssh -i $SSH_KEY ystack@$PUBLIC_IP"
+  echo "  echo:       http://$PUBLIC_IP/q/envoy/echo"
+  echo "  s3 health:  http://$PUBLIC_IP/s3/health"
+  echo "================================================================"
+  exit 0
+fi
+
+echo >&2
+echo "remote probes never answered; diagnostics below. NOTE: the EXIT trap tears the instance down -- set KEEP=1 to preserve it for inspection:" >&2
+# shellcheck disable=SC2086
+ssh $SSH_OPTS ystack@"$PUBLIC_IP" \
+  'sudo systemctl is-active k3s; sudo k3s kubectl get pods -A 2>&1 | head -30' >&2 \
+  || true # y-script-lint:disable=or-true # diagnostic best-effort
+exit 1
diff --git a/scripts/gcp-bootstrap-credentials.sh b/scripts/gcp-bootstrap-credentials.sh
new file mode 100755
index 0000000..fdd29c7
--- /dev/null
+++ b/scripts/gcp-bootstrap-credentials.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+# Bootstrap a service-account JSON key for automation to use
+# against a GCP project (typically your y-cluster appliance QA
+# project; see .env.example for the operator-local default).
+#
+# Run this on a machine where you're already gcloud-logged-in
+# as a project Owner (or Editor with IAM admin). It will:
+#  1. Verify your active gcloud account can act on the project.
+#  2. Enable the Compute / Storage APIs the appliance-qemu-to-gcp
+#     flow needs. (No Cloud Build: we convert qcow2 -> raw -> tar
+#     locally and use `images create --source-uri=gs://...`, which
+#     is a direct image create with no managed conversion job.)
+#  3. Create (or reuse) a service account named
+#     <SA_NAME>@<GCP_PROJECT>.iam.gserviceaccount.com.
+#  4. Grant it roles/owner on the project. (QA project; broad
+#     role keeps the bootstrap simple. Tighten later if QA gets
+#     reused for non-QA assets.)
+#  5. Generate a JSON key for the service account.
+#  6. Print the JSON between unmistakable BEGIN/END markers so
+#     you can copy-paste from your terminal scrollback to the
+#     machine that needs the credentials. The key is also left
+#     on disk at $KEY_FILE in case you'd rather scp it.
+#
+# After copying: on the other machine, save the JSON between
+# the markers (NOT the markers themselves) to a file, chmod
+# 600 it, and point GCP_KEY in $REPO_ROOT/.env at it. The
+# appliance scripts read GCP_KEY from .env.
+
+[ -z "$DEBUG" ] || set -x
+set -eo pipefail
+
+YHELP='gcp-bootstrap-credentials.sh - create + grant + key a service account for the y-cluster appliance flow, then print the JSON for cross-machine copy-paste
+
+Usage: gcp-bootstrap-credentials.sh
+
+Environment:
+  GCP_PROJECT  GCP project (set in .env or shell env; required)
+  SA_NAME      Service account local part (default: y-cluster-appliance)
+  KEY_FILE     Where to write the JSON key on this machine
+               (default: ./y-cluster-gcp-key.json)
+  DEBUG        Set non-empty for bash trace
+
+Dependencies:
+  gcloud (logged in as a Project Owner or equivalent)
+'
+
+case "${1:-}" in
+  help) echo "$YHELP"; exit 0 ;;
+  --help) echo "$YHELP"; exit 0 ;;
+  -h) echo "$YHELP"; exit 0 ;;
+esac
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+if [[ -f "$REPO_ROOT/.env" ]]; then
+  set -o allexport; . "$REPO_ROOT/.env"; set +o allexport
+fi
+
+: "${GCP_PROJECT:?set GCP_PROJECT in .env or shell env}"
+PROJECT_ID="$GCP_PROJECT"
+SA_NAME="${SA_NAME:-y-cluster-appliance}"
+SA_EMAIL="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"
+KEY_FILE="${KEY_FILE:-./y-cluster-gcp-key.json}"
+
+command -v gcloud >/dev/null || { echo "gcloud not found on PATH" >&2; exit 1; }
+
+stage() { printf '\n=== %s ===\n' "$*"; }
+
+# 1. Verify caller is logged in and can see the project.
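+#    (If you want to check your own roles by hand first, something like
+#     the following works -- substitute your own account in the member
+#     filter; purely illustrative, the script itself only checks that
+#     `projects describe` succeeds:
+#       gcloud projects get-iam-policy "$PROJECT_ID" \
+#         --flatten="bindings[].members" \
+#         --filter="bindings.members:user:you@example.com" \
+#         --format="value(bindings.role)")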
+stage "verifying gcloud auth + project access ($PROJECT_ID)"
+ACTIVE=$(gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null || true) # y-script-lint:disable=or-true # gcloud returns nonzero when no active account; we surface our own error below
+if [[ -z "$ACTIVE" ]]; then
+  echo "no active gcloud account; run: gcloud auth login" >&2
+  exit 1
+fi
+echo "  active account: $ACTIVE"
+gcloud projects describe "$PROJECT_ID" --format="value(projectId)" >/dev/null \
+  || { echo "cannot read project $PROJECT_ID with $ACTIVE" >&2; exit 1; }
+
+# 2. Enable required APIs. Idempotent: gcloud reports the
+# already-enabled ones as no-ops.
+stage "enabling APIs (compute, storage)"
+gcloud services enable \
+  compute.googleapis.com \
+  storage.googleapis.com \
+  --project="$PROJECT_ID"
+
+# 3. Create the service account (idempotent: skip if it
+# exists). gcloud doesn't ship a clean "create or skip", so
+# we probe first.
+stage "creating service account $SA_EMAIL (idempotent)"
+if gcloud iam service-accounts describe "$SA_EMAIL" \
+  --project="$PROJECT_ID" >/dev/null 2>&1; then
+  echo "  already exists, reusing"
+else
+  gcloud iam service-accounts create "$SA_NAME" \
+    --display-name="y-cluster appliance automation" \
+    --description="Used by scripts/appliance-qemu-to-gcp.sh to upload custom images and provision VMs in $PROJECT_ID" \
+    --project="$PROJECT_ID"
+fi
+
+# 4. Grant roles/owner on the project. QA project; broad role
+# is intentional and matches the project's stated purpose. If
+# this account ever gets reused for non-QA assets, tighten to
+# the union of: compute.admin, storage.admin,
+# iam.serviceAccountUser.
+stage "granting roles/owner on $PROJECT_ID to $SA_EMAIL"
+gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+  --member="serviceAccount:$SA_EMAIL" \
+  --role="roles/owner" \
+  --project="$PROJECT_ID" \
+  --condition=None \
+  >/dev/null
+
+# 5. Mint a fresh JSON key. Each invocation creates a new key.
+# GCP allows up to 10 keys per service account; if the operator
+# is rotating, they can `gcloud iam service-accounts keys list`
+# and delete the stale ones with `keys delete`.
+stage "minting JSON key -> $KEY_FILE"
+rm -f "$KEY_FILE"
+gcloud iam service-accounts keys create "$KEY_FILE" \
+  --iam-account="$SA_EMAIL" \
+  --project="$PROJECT_ID"
+chmod 600 "$KEY_FILE"
+
+# 6. Print the JSON between markers for clipboard-friendly
+# copy. Markers are exact strings the destination machine can
+# grep for if they want to extract programmatically.
+echo
+echo "================================================================"
+echo "JSON key for $SA_EMAIL"
+echo "Project: $PROJECT_ID"
+echo
+echo "On the destination machine, save the lines BETWEEN the"
+echo "----- BEGIN ... ----- and ----- END ... ----- markers"
+echo "(NOT the markers themselves) to a file, then:"
+echo "  chmod 600 <file>"
+echo "  set GCP_KEY=<file> in \$REPO_ROOT/.env"
+echo "================================================================"
+echo
+echo "----- BEGIN GCP SERVICE ACCOUNT KEY ($SA_EMAIL) -----"
+cat "$KEY_FILE"
+echo
+echo "----- END GCP SERVICE ACCOUNT KEY ($SA_EMAIL) -----"
+echo
+echo "Local copy of the key (kept for scp / re-paste): $KEY_FILE"
+echo "To revoke this key later:"
+echo "  gcloud iam service-accounts keys list --iam-account=$SA_EMAIL --project=$PROJECT_ID"
+echo "  gcloud iam service-accounts keys delete <key-id> --iam-account=$SA_EMAIL --project=$PROJECT_ID"
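+
+# For reference -- purely illustrative, the transcript filename is the
+# operator's choice -- the destination machine can pull the JSON out of
+# a saved copy of this output using the markers themselves:
+#   sed -n '/^----- BEGIN GCP SERVICE ACCOUNT KEY/,/^----- END GCP SERVICE ACCOUNT KEY/p' transcript.txt \
+#     | sed '1d;$d' > gcp-appliance.key.json
+#   chmod 600 gcp-appliance.key.json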