diff --git a/applications/nfsserver/Dockerfile b/applications/nfsserver/Dockerfile index 09164480f..e772f79d5 100644 --- a/applications/nfsserver/Dockerfile +++ b/applications/nfsserver/Dockerfile @@ -4,22 +4,32 @@ FROM golang:1.26.2 AS provisioner WORKDIR /usr/src/app COPY nfs-subdir-external-provisioner ./ -RUN make +RUN go mod tidy && make + +# compile nfsvol (mount manager / watchdog) +FROM golang:1.26.2 AS nfsvol + +WORKDIR /usr/src/nfsvol +COPY nfsvol/ ./ +RUN go mod tidy && go build -o /usr/local/bin/nfsvol . # -FROM k8s.gcr.io/volume-nfs:0.8 +# Upstream continuation of the old k8s.gcr.io/volume-nfs:0.8 image, maintained +# by the Kubernetes project at test/images/volume/nfs. CentOS Stream 9 base +# with nfs-utils already installed; we only need to add e2fsprogs for mkfs.ext4. +FROM rockylinux/rockylinux:10.1-minimal -RUN yum install -y \ - e2fsprogs \ - nfs-utils \ - rpcbind \ - && yum clean all \ - && rm -rf /var/cache/yum +RUN microdnf install -y \ + e2fsprogs \ + rpcbind \ + nfs-utils \ + && microdnf clean all COPY --from=provisioner /usr/src/app/bin/nfs-subdir-external-provisioner /usr/local/bin/nfs-subdir-external-provisioner +COPY --from=nfsvol /usr/local/bin/nfsvol /usr/local/bin/nfsvol COPY resources/*.sh /usr/local/bin/ -RUN chmod +x /usr/local/bin/*.sh +RUN chmod +x /usr/local/bin/*.sh /usr/local/bin/nfsvol RUN echo "/exports *(rw,fsid=0,insecure,no_subtree_check,no_root_squash,crossmnt)" > /etc/exports diff --git a/applications/nfsserver/README-PROD.md b/applications/nfsserver/README-PROD.md new file mode 100644 index 000000000..1e32a5c2f --- /dev/null +++ b/applications/nfsserver/README-PROD.md @@ -0,0 +1,106 @@ +# NFS Server — Production Caveats + +This NFS server is a pragmatic, single-replica design that trades some +durability for operational simplicity. The defaults favour **client-pod +availability over write durability**: during a server outage, applications +see `EIO` and are expected to retry, rather than hanging indefinitely. + +Read this before relying on it for production workloads. + +## Architecture + +- One NFS server pod (single-replica Deployment, `strategy: Recreate`). +- Backing storage: one RWO PVC (`nfs-exports`) holding per-PVC ext4 loopback + files at `/exports/.quota`, mounted at `/exports/`. +- Clients mount via a stable ClusterIP Service + DNS FQDN, with + `soft,nolock,local_lock=all,nfsvers=3`. +- Per-PVC exports are written to `/etc/exports.d/` with a deterministic + SHA-256-derived fsid, so client file handles survive server pod restarts. + +## Outage behaviour + +| Trigger | Outage duration (for active client pods) | Client-visible error | +|---|---|---| +| `kubectl delete pod nfs-server-...` | seconds (kube reschedules immediately) | brief `EIO`, then resumes transparently (fsid stable) | +| `kubectl rollout restart` | seconds — Recreate waits for old pod first | brief `EIO`, then resumes | +| Graceful node drain of NFS node | ~30–60 s (PVC detach + reattach) | brief `EIO`, then resumes | +| **Ungraceful node loss** (node crash, network partition) | **up to ~6 minutes** (force-detach timeout) | `EIO` repeatedly until pod reattaches on another node | +| Loopback goes stale on same host (rare) | up to 30 s (watchdog period) | transparent, clients do not notice | + +The ~6 minute ungraceful-loss window is inherent to RWO storage with +cloud-provider CSI drivers and cannot be eliminated without switching to a +different storage strategy (see "Not suitable for" below). + +## Application requirements + +Applications that use PVCs from this provisioner **must**: + +- Tolerate `EIO` on reads and writes. Retry with backoff. The current config + uses `soft` mount semantics — I/O returns an error rather than hanging. +- Not rely on POSIX file-range locking (`flock`, `fcntl`) across pods. + `nolock,local_lock=all` disables cross-client locking. Shared-writer + workloads (e.g. SQLite, cooperating text editors) will race silently. +- Not assume write-through durability during an outage. In-flight writes + that return `EIO` may or may not have reached disk. + +## What these caveats rule out + +This backend is **not suitable for**: + +- Databases that require fsync durability semantics (use a proper database + PVC, not NFS). +- Workloads with multiple writers to the same file across nodes. +- Strict HA requirements (no failover during ungraceful node loss). +- Large cross-region deployments (single RWO PVC is region-local). + +It **is suitable for**: + +- Shared read-only / append-only data between pods (logs, content). +- Cache / scratch volumes where a brief `EIO` is retryable. +- Shared artifact storage between producer and consumer pods. + +## Operator responsibilities + +### Backup + +`nfs-exports` is a single cloud PVC with no built-in backup. If lost, every +NFS-backed PVC in the cluster is lost. Operators must: + +- Schedule snapshots of `nfs-exports` (cloud-provider-specific). +- Store snapshots in a separate region/account for real DR. + +Neither `nfsvol` nor the provisioner automates this. It is intentional; DR +policy is a per-deployment decision. + +### Graceful node migration + +To move the NFS server pod to a different node: + +1. Cordon the target node preferences as needed. +2. Either cordon+drain the source node (standard flow), or +3. `kubectl delete pod nfs-server-...` — the pod terminates, PVC detaches, + and a new pod schedules on any eligible node. + +With `podDisruptionBudget.enabled: true` in `values.yaml`, `kubectl drain` +will be blocked by the PDB. This is intentional — forces the operator to +use the explicit delete-pod flow so automated tooling does not evict +unaware. + +### Monitoring + +- `/healthz` on port 8080 exposes the watchdog health (readiness and + liveness probes already consume this). +- Watch for `watchdog: remount failed` log lines — indicates the loopback + layer is inconsistent with `/exports/*.quota`. +- Watch for `mount-all: N of M mounts failed` at startup. + +## Not addressed by this iteration + +- Multi-region / cross-cluster replication. +- Automated snapshot scheduling. +- Active/passive HA (would need shared block storage + fencing, or a move + to a managed NFS service — EFS, Filestore, Azure Files). + +If any of those become requirements, switch to a managed NFS or a proper +CSI driver. This backend was designed for small, single-region, best-effort +shared storage. diff --git a/applications/nfsserver/deploy/templates/nfs-server.yaml b/applications/nfsserver/deploy/templates/nfs-server.yaml index 32570713c..f3166ca23 100644 --- a/applications/nfsserver/deploy/templates/nfs-server.yaml +++ b/applications/nfsserver/deploy/templates/nfs-server.yaml @@ -19,6 +19,11 @@ metadata: app: nfs-server usesvolume: nfs-exports spec: + # Recreate strategy: with an RWO PVC, a surge pod can never attach while the + # old pod holds nfs-exports, so RollingUpdate would deadlock. Single-replica + # by design. + strategy: + type: Recreate selector: matchLabels: app: nfs-server @@ -29,6 +34,7 @@ spec: app: nfs-server usesvolume: nfs-exports spec: + serviceAccountName: {{ template "nfs-subdir-external-provisioner.serviceAccountName" . }} affinity: podAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -53,6 +59,11 @@ spec: value: {{ .Values.apps.nfsserver.nfs.path }} - name: PROVISIONER_NAME value: {{ printf "%s-nfs-provisioner" .Values.namespace }} + # Single-replica Deployment backed by an RWO PVC — only one pod can + # ever mount /exports, so leader election is unnecessary and would + # otherwise need v1 Endpoints RBAC (deprecated in k8s 1.33+). + - name: ENABLE_LEADER_ELECTION + value: "false" ports: - name: nfs containerPort: 2049 @@ -60,6 +71,27 @@ spec: containerPort: 20048 - name: rpcbind containerPort: 111 + - name: healthz + containerPort: 8080 + startupProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 30 + failureThreshold: 240 + readinessProbe: + httpGet: + path: /ready + port: 8080 + periodSeconds: 15 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + periodSeconds: 30 + failureThreshold: 5 securityContext: privileged: true volumeMounts: diff --git a/applications/nfsserver/deploy/templates/pdb.yaml b/applications/nfsserver/deploy/templates/pdb.yaml new file mode 100644 index 000000000..f7cdefaa1 --- /dev/null +++ b/applications/nfsserver/deploy/templates/pdb.yaml @@ -0,0 +1,21 @@ +{{- if .Values.apps.nfsserver.podDisruptionBudget.enabled }} +# A PDB with minAvailable: 1 on a single-replica RWO-PVC-backed Deployment +# blocks kubectl drain / automated node-group rotations — by design. It is a +# guardrail forcing the operator to consciously migrate the NFS server pod +# (terminate → wait for PVC detach → new pod attaches on target node) rather +# than letting tooling evict it blindly. +# +# This does NOT help with ungraceful node loss (which goes via the ~6-minute +# force-detach path, not the voluntary-disruption path). +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: nfs-server + labels: + app: nfs-server +spec: + minAvailable: 1 + selector: + matchLabels: + app: nfs-server +{{- end }} diff --git a/applications/nfsserver/deploy/values.yaml b/applications/nfsserver/deploy/values.yaml index 713ee0ec2..7a90d8cd0 100644 --- a/applications/nfsserver/deploy/values.yaml +++ b/applications/nfsserver/deploy/values.yaml @@ -4,7 +4,7 @@ harness: auto: false deployment: auto: false - image: gcr.io/metacellllc/cloudharness/nfsserver:1.0 + # image: gcr.io/metacellllc/cloudharness/nfsserver:1.0 # nfs server pvc disk size (/exports) @@ -32,8 +32,15 @@ nfs: useDNS: true path: /exports # /exports mountOptions: + # `soft` overrides the Linux client default (`hard`). Without it, applications + # block in uninterruptible sleep when the NFS server is unreachable. With + # `soft`, I/O returns EIO after a few retries — applications stay alive and + # can retry. This matches the availability-over-integrity tradeoff. + - soft - nolock - local_lock=all + - timeo=50 # 5 s per retry + - retrans=2 # 2 retries → ~15 s before EIO during server downtime volumeName: nfs-subdir-external-provisioner-root # Reclaim policy for the main nfs volume reclaimPolicy: Retain @@ -84,6 +91,14 @@ leaderElection: # When set to false leader election will be disabled enabled: true +# PodDisruptionBudget guardrail against accidental evictions. +# Off by default — enabling it blocks `kubectl drain` of the NFS server's +# node until the operator terminates the pod manually (RWO PVC means there +# is no surge pod that could satisfy minAvailable=1 while the old one runs). +# Does NOT help with ungraceful node loss (force-detach path). +podDisruptionBudget: + enabled: false + ## For RBAC support: rbac: # Specifies whether RBAC resources should be created diff --git a/applications/nfsserver/nfs-subdir-external-provisioner/cmd/nfs-subdir-external-provisioner/provisioner.go b/applications/nfsserver/nfs-subdir-external-provisioner/cmd/nfs-subdir-external-provisioner/provisioner.go index 09e1df085..7f9b4e68c 100644 --- a/applications/nfsserver/nfs-subdir-external-provisioner/cmd/nfs-subdir-external-provisioner/provisioner.go +++ b/applications/nfsserver/nfs-subdir-external-provisioner/cmd/nfs-subdir-external-provisioner/provisioner.go @@ -18,7 +18,6 @@ package main import ( "context" - "errors" "flag" "fmt" "os" @@ -42,12 +41,16 @@ import ( const ( provisionerNameKey = "PROVISIONER_NAME" + annotationPrefix = "k8s-sigs.io" ) type nfsProvisioner struct { - client kubernetes.Interface - server string - path string + client kubernetes.Interface + server string + path string + defaultMode os.FileMode + defaultUid int + defaultGid int } type pvcMetadata struct { @@ -85,8 +88,6 @@ func (p *nfsProvisioner) Provision(ctx context.Context, options controller.Provi return nil, controller.ProvisioningFinished, fmt.Errorf("claim Selector is not supported") } glog.V(4).Infof("nfs provisioner: VolumeOptions %v", options) - capacity := options.PVC.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)] - requestBytes := capacity.Value() pvcNamespace := options.PVC.Namespace pvcName := options.PVC.Name @@ -114,20 +115,15 @@ func (p *nfsProvisioner) Provision(ctx context.Context, options controller.Provi } } + // Retrieve requested storage size for the quota-backed loopback filesystem. + capacity := options.PVC.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)] + requestBytes := capacity.Value() - // glog.V(4).Infof("creating path %s", fullPath) - // if err := os.MkdirAll(fullPath, 0o777); err != nil { - // return nil, controller.ProvisioningFinished, errors.New("unable to create directory to provision new pv: " + err.Error()) - // } - - // if err := os.Chmod(fullPath, 0o777); err != nil { - // return nil, "", err - // } - - cmd := exec.Command("/usr/local/bin/mklimdir.sh", "-m", fullPath, "-s", strconv.FormatInt(requestBytes, 10)) - if err := cmd.Run(); err != nil { - return nil, controller.ProvisioningFinished, errors.New("unable to create directory to provision new pv: " + err.Error()) - } + glog.V(4).Infof("creating quota-backed path %s (%d bytes)", fullPath, requestBytes) + cmd := exec.Command("/usr/local/bin/nfsvol", "create", "-m", fullPath, "-s", strconv.FormatInt(requestBytes, 10)) + if out, err := cmd.CombinedOutput(); err != nil { + return nil, controller.ProvisioningFinished, fmt.Errorf("unable to create directory to provision new pv: %w\n%s", err, out) + } pv := &v1.PersistentVolume{ ObjectMeta: metav1.ObjectMeta{ @@ -155,22 +151,25 @@ func (p *nfsProvisioner) Provision(ctx context.Context, options controller.Provi func (p *nfsProvisioner) Delete(ctx context.Context, volume *v1.PersistentVolume) error { path := volume.Spec.PersistentVolumeSource.NFS.Path basePath := filepath.Base(path) - oldPath := filepath.Join(mountPath, basePath) + // Derive the local path by substituting the NFS export root with the local mount path. + oldPath := strings.Replace(path, p.path, mountPath, 1) if _, err := os.Stat(oldPath); os.IsNotExist(err) { glog.Warningf("path %s does not exist, deletion skipped", oldPath) return nil } + // Get the storage class for this volume. storageClass, err := p.getClassForVolume(ctx, volume) if err != nil { return err } - // delete the loop back device - cmd := exec.Command("/usr/local/bin/rmlimdir.sh", "-m", oldPath) - if err := cmd.Run(); err != nil { - return err + // Unmount the quota-backed loopback and rename the .quota file to the mountpoint path, + // leaving a plain file that can be archived or removed by the logic below. + cmd := exec.Command("/usr/local/bin/nfsvol", "delete", "-m", oldPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("nfsvol delete: %w\n%s", err, out) } // Determine if the "onDelete" parameter exists. @@ -220,20 +219,46 @@ func (p *nfsProvisioner) getClassForVolume(ctx context.Context, pv *v1.Persisten return class, nil } +func getModeFromString(mode string) (os.FileMode, error) { + if mode == "" { + return os.FileMode(0o777), nil + } + modeInt, err := strconv.ParseInt(mode, 8, 64) + if err != nil { + return 0, fmt.Errorf("invalid mode %s: %v", mode, err) + } + if modeInt < 0 || modeInt > 0o777 { + return 0, fmt.Errorf("mode must be between 0 and 0777, got %s", mode) + } + return os.FileMode(modeInt), nil +} + +func getIdFromString(id string) (int, error) { + if id == "" { + return 0, nil + } + idInt, err := strconv.Atoi(id) + if err != nil { + return 0, fmt.Errorf("invalid id %s: %v", id, err) + } + if idInt < 0 || idInt > 65535 { + return 0, fmt.Errorf("id must be between 0 and 65535, got %s", id) + } + return idInt, nil +} + func main() { flag.Parse() flag.Set("logtostderr", "true") server := os.Getenv("NFS_SERVER") if server == "" { - // 2022-11-04 ZS: use the internal K8s service host env variable - // for getting the ip address of the nfs server + // Fall back to the in-cluster service host env variable injected by Kubernetes. server = os.Getenv("NFS_SERVER_SERVICE_HOST") if server == "" { glog.Fatal("NFS_SERVER and NFS_SERVER_SERVICE_HOST are both not set") - } else { - glog.Infof("Using NFS Server: %s", server) } + glog.Infof("NFS_SERVER not set, using NFS_SERVER_SERVICE_HOST: %s", server) } path := os.Getenv("NFS_PATH") if path == "" { @@ -243,20 +268,28 @@ func main() { if provisionerName == "" { glog.Fatalf("environment variable %s is not set! Please set it.", provisionerNameKey) } + + mode, err := getModeFromString(os.Getenv("NFS_DEFAULT_MODE")) + if err != nil { + glog.Fatalf("Failed to parse NFS_DEFAULT_MODE: %v", err) + } + uid, err := getIdFromString(os.Getenv("NFS_DEFAULT_UID")) + if err != nil { + glog.Fatalf("Failed to parse NFS_DEFAULT_UID: %v", err) + } + gid, err := getIdFromString(os.Getenv("NFS_DEFAULT_GID")) + if err != nil { + glog.Fatalf("Failed to parse NFS_DEFAULT_GID: %v", err) + } + kubeconfig := os.Getenv("KUBECONFIG") var config *rest.Config if kubeconfig != "" { - // Create an OutOfClusterConfig and use it to create a client for the controller - // to use to communicate with Kubernetes - var err error config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { glog.Fatalf("Failed to create kubeconfig: %v", err) } } else { - // Create an InClusterConfig and use it to create a client for the controller - // to use to communicate with Kubernetes - var err error config, err = rest.InClusterConfig() if err != nil { glog.Fatalf("Failed to create config: %v", err) @@ -284,12 +317,14 @@ func main() { } clientNFSProvisioner := &nfsProvisioner{ - client: clientset, - server: server, - path: path, + client: clientset, + server: server, + path: path, + defaultMode: mode, + defaultUid: uid, + defaultGid: gid, } - // Start the provision controller which will dynamically provision efs NFS - // PVs + // Start the provision controller which will dynamically provision NFS PVs. pc := controller.NewProvisionController(clientset, provisionerName, clientNFSProvisioner, diff --git a/applications/nfsserver/nfs-subdir-external-provisioner/go.mod b/applications/nfsserver/nfs-subdir-external-provisioner/go.mod index d65f1c46c..97ed417cb 100644 --- a/applications/nfsserver/nfs-subdir-external-provisioner/go.mod +++ b/applications/nfsserver/nfs-subdir-external-provisioner/go.mod @@ -1,95 +1,97 @@ module github.com/kubernetes-sigs/nfs-subdir-external-provisioner -go 1.24.0 +go 1.26 + +toolchain go1.26.0 require ( github.com/golang/glog v1.2.5 - k8s.io/api v0.23.17 - k8s.io/apimachinery v0.23.17 - k8s.io/client-go v0.23.17 - k8s.io/component-helpers v0.23.17 - sigs.k8s.io/sig-storage-lib-external-provisioner/v6 v6.0.0 + k8s.io/api v0.35.1 + k8s.io/apimachinery v0.35.1 + k8s.io/client-go v0.35.1 + k8s.io/component-helpers v0.35.0 + sigs.k8s.io/sig-storage-lib-external-provisioner/v6 v6.3.0 ) require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/logr v1.4.3 // indirect - github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect - github.com/golang/protobuf v1.5.4 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/gofuzz v1.1.0 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/googleapis/gnostic v0.5.5 // indirect - github.com/imdario/mergo v0.3.5 // indirect + github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect github.com/miekg/dns v1.1.29 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/prometheus/client_golang v1.21.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_golang v1.20.4 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.63.0 // indirect - github.com/prometheus/procfs v0.16.0 // indirect - github.com/spf13/pflag v1.0.5 // indirect - golang.org/x/crypto v0.45.0 // indirect - golang.org/x/net v0.48.0 // indirect - golang.org/x/oauth2 v0.34.0 // indirect - golang.org/x/sys v0.39.0 // indirect - golang.org/x/term v0.37.0 // indirect - golang.org/x/text v0.32.0 // indirect - golang.org/x/time v0.11.0 // indirect - google.golang.org/protobuf v1.36.10 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect + github.com/spf13/pflag v1.0.9 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/crypto v0.48.0 // indirect + golang.org/x/net v0.49.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sys v0.41.0 // indirect + golang.org/x/term v0.40.0 // indirect + golang.org/x/text v0.34.0 // indirect + golang.org/x/time v0.9.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/klog v1.0.0 // indirect - k8s.io/klog/v2 v2.30.0 // indirect - k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65 // indirect - k8s.io/utils v0.0.0-20211116205334-6203023598ed // indirect - sigs.k8s.io/json v0.0.0-20211020170558-c049b76a60c6 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect - sigs.k8s.io/yaml v1.2.0 // indirect + k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) replace ( - github.com/elazarl/goproxy => github.com/elazarl/goproxy v0.0.0-20230731152917-f99041a5c027 - github.com/emicklei/go-restful => github.com/emicklei/go-restful v2.16.0+incompatible - github.com/getkin/kin-openapi => github.com/getkin/kin-openapi v0.131.0 - github.com/prometheus/client_golang => github.com/prometheus/client_golang v1.21.1 - github.com/sirupsen/logrus => github.com/sirupsen/logrus v1.9.3 - golang.org/x/crypto => golang.org/x/crypto v0.45.0 - golang.org/x/image => golang.org/x/image v0.18.0 - golang.org/x/oauth2 => golang.org/x/oauth2 v0.27.0 - google.golang.org/grpc => google.golang.org/grpc v1.79.3 - gopkg.in/yaml.v3 => gopkg.in/yaml.v3 v3.0.1 - k8s.io/api => k8s.io/api v0.23.17 - k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.23.17 - k8s.io/apimachinery => k8s.io/apimachinery v0.23.17 - k8s.io/apiserver => k8s.io/apiserver v0.23.17 - k8s.io/cli-runtime => k8s.io/cli-runtime v0.23.17 - k8s.io/client-go => k8s.io/client-go v0.23.17 - k8s.io/cloud-provider => k8s.io/cloud-provider v0.23.17 - k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.23.17 - k8s.io/code-generator => k8s.io/code-generator v0.23.17 - k8s.io/component-base => k8s.io/component-base v0.23.17 - k8s.io/component-helpers => k8s.io/component-helpers v0.23.17 - k8s.io/controller-manager => k8s.io/controller-manager v0.23.17 - k8s.io/cri-api => k8s.io/cri-api v0.23.17 - k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.23.17 - k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.23.17 - k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.23.17 - k8s.io/kube-proxy => k8s.io/kube-proxy v0.23.17 - k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.23.17 - k8s.io/kubectl => k8s.io/kubectl v0.23.17 - k8s.io/kubelet => k8s.io/kubelet v0.23.17 - k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.23.17 - k8s.io/metrics => k8s.io/metrics v0.23.17 - k8s.io/mount-utils => k8s.io/mount-utils v0.23.17 - k8s.io/pod-security-admission => k8s.io/pod-security-admission v0.23.17 - k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.23.17 - k8s.io/sample-cli-plugin => k8s.io/sample-cli-plugin v0.23.17 - k8s.io/sample-controller => k8s.io/sample-controller v0.23.17 + github.com/prometheus/client_golang => github.com/prometheus/client_golang v1.23.2 + golang.org/x/crypto => golang.org/x/crypto v0.48.0 + golang.org/x/net => golang.org/x/net v0.50.0 + gopkg.in/yaml.v3 => gopkg.in/yaml.v3 v3.0.0-20220521103104-8f96da9f5d5e + k8s.io/api => k8s.io/api v0.35.1 + k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.35.1 + k8s.io/apimachinery => k8s.io/apimachinery v0.35.1 + k8s.io/apiserver => k8s.io/apiserver v0.35.1 + k8s.io/cli-runtime => k8s.io/cli-runtime v0.35.1 + k8s.io/client-go => k8s.io/client-go v0.35.1 + k8s.io/cloud-provider => k8s.io/cloud-provider v0.35.1 + k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.35.1 + k8s.io/code-generator => k8s.io/code-generator v0.35.1 + k8s.io/component-base => k8s.io/component-base v0.35.1 + k8s.io/component-helpers => k8s.io/component-helpers v0.35.1 + k8s.io/controller-manager => k8s.io/controller-manager v0.35.1 + k8s.io/cri-api => k8s.io/cri-api v0.35.1 + k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.35.1 + k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.35.1 + k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.35.1 + k8s.io/kube-proxy => k8s.io/kube-proxy v0.35.1 + k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.35.1 + k8s.io/kubectl => k8s.io/kubectl v0.35.1 + k8s.io/kubelet => k8s.io/kubelet v0.35.1 + k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.35.1 + k8s.io/metrics => k8s.io/metrics v0.35.1 + k8s.io/mount-utils => k8s.io/mount-utils v0.35.1 + k8s.io/pod-security-admission => k8s.io/pod-security-admission v0.35.1 + k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.35.1 + k8s.io/sample-cli-plugin => k8s.io/sample-cli-plugin v0.35.1 + k8s.io/sample-controller => k8s.io/sample-controller v0.35.1 ) diff --git a/applications/nfsserver/nfsvol/go.mod b/applications/nfsserver/nfsvol/go.mod new file mode 100644 index 000000000..94440cf26 --- /dev/null +++ b/applications/nfsserver/nfsvol/go.mod @@ -0,0 +1,5 @@ +module metacell/nfsvol + +go 1.24 + +require golang.org/x/sys v0.33.0 diff --git a/applications/nfsserver/nfsvol/internal/loop/loop.go b/applications/nfsserver/nfsvol/internal/loop/loop.go new file mode 100644 index 000000000..584c94145 --- /dev/null +++ b/applications/nfsserver/nfsvol/internal/loop/loop.go @@ -0,0 +1,195 @@ +package loop + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "golang.org/x/sys/unix" +) + +const loopControlPath = "/dev/loop-control" + +// GetFree returns the index of the next free loop device as allocated by the kernel. +// The kernel atomically reserves the device so concurrent callers get distinct indices. +func GetFree() (int, error) { + ctrl, err := os.OpenFile(loopControlPath, os.O_RDWR, 0) + if err != nil { + return 0, fmt.Errorf("open %s: %w", loopControlPath, err) + } + defer ctrl.Close() + n, err := unix.IoctlRetInt(int(ctrl.Fd()), unix.LOOP_CTL_GET_FREE) + if err != nil { + return 0, fmt.Errorf("LOOP_CTL_GET_FREE: %w", err) + } + return n, nil +} + +// DevPath returns the device path for loop index n. +func DevPath(n int) string { + return fmt.Sprintf("/dev/loop%d", n) +} + +// EnsureDevice creates the block device node for /dev/loopN if it does not exist. +func EnsureDevice(n int) error { + path := DevPath(n) + if _, err := os.Stat(path); err == nil { + return nil + } + dev := unix.Mkdev(7, uint32(n)) + return unix.Mknod(path, unix.S_IFBLK|0666, int(dev)) +} + +// Attach attaches backingFile to a free loop device and returns the loop device path. +func Attach(backingFile string) (string, error) { + n, err := GetFree() + if err != nil { + return "", err + } + if err := EnsureDevice(n); err != nil { + return "", fmt.Errorf("ensure device loop%d: %w", n, err) + } + loopPath := DevPath(n) + + bf, err := os.OpenFile(backingFile, os.O_RDWR, 0) + if err != nil { + return "", fmt.Errorf("open backing file %s: %w", backingFile, err) + } + defer bf.Close() + + lf, err := os.OpenFile(loopPath, os.O_RDWR, 0) + if err != nil { + return "", fmt.Errorf("open loop device %s: %w", loopPath, err) + } + defer lf.Close() + + if err := unix.IoctlSetInt(int(lf.Fd()), unix.LOOP_SET_FD, int(bf.Fd())); err != nil { + return "", fmt.Errorf("LOOP_SET_FD on %s: %w", loopPath, err) + } + return loopPath, nil +} + +// Detach disassociates the backing file from loopPath. +func Detach(loopPath string) error { + lf, err := os.OpenFile(loopPath, os.O_RDONLY, 0) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("open %s: %w", loopPath, err) + } + defer lf.Close() + if err := unix.IoctlSetInt(int(lf.Fd()), unix.LOOP_CLR_FD, 0); err != nil { + return fmt.Errorf("LOOP_CLR_FD on %s: %w", loopPath, err) + } + return nil +} + +// FindByBacking returns the first loop device path backed by the given file, +// or "" if none is found. +func FindByBacking(backingFile string) (string, error) { + abs, err := filepath.Abs(backingFile) + if err != nil { + abs = backingFile + } + entries, err := filepath.Glob("/sys/block/loop*/loop/backing_file") + if err != nil { + return "", err + } + for _, entry := range entries { + data, err := os.ReadFile(entry) + if err != nil { + continue + } + backing := strings.TrimSpace(string(data)) + backing = strings.TrimSuffix(backing, " (deleted)") + if backing == abs { + parts := strings.Split(entry, "/") + return "/dev/" + parts[3], nil + } + } + return "", nil +} + +// DetachByBacking finds and detaches ALL loop devices backing the given file. +// The NFS server pod shares the host mount namespace; if the previous pod was +// killed without graceful shutdown the loop device may still be mounted in the +// host namespace, causing LOOP_CLR_FD to fail with EBUSY. A lazy unmount of +// the associated mountpoint (backing path without ".quota" suffix) is attempted +// first so LOOP_CLR_FD can succeed. +// It is a no-op if no loop device is associated with the file. +func DetachByBacking(backingFile string) error { + abs, err := filepath.Abs(backingFile) + if err != nil { + abs = backingFile + } + entries, _ := filepath.Glob("/sys/block/loop*/loop/backing_file") + for _, entry := range entries { + data, err := os.ReadFile(entry) + if err != nil { + continue + } + backing := strings.TrimSpace(string(data)) + backing = strings.TrimSuffix(backing, " (deleted)") + if backing == abs { + // Lazy-unmount the mountpoint so the loop device is no longer + // "in use" before calling LOOP_CLR_FD. + mountpoint := strings.TrimSuffix(abs, ".quota") + _ = unix.Unmount(mountpoint, unix.MNT_DETACH) + parts := strings.Split(entry, "/") + _ = Detach("/dev/" + parts[3]) + } + } + return nil +} + +// CleanStale detaches all loop devices whose kernel-reported backing file is marked deleted. +func CleanStale() { + entries, _ := filepath.Glob("/sys/block/loop*/loop/backing_file") + for _, entry := range entries { + data, err := os.ReadFile(entry) + if err != nil { + continue + } + if strings.Contains(string(data), "(deleted)") { + parts := strings.Split(entry, "/") + _ = Detach("/dev/" + parts[3]) + } + } +} + +// CleanByFiles detaches ALL loop devices whose backing file is in the given set. +// It builds the loop->backing index exactly once (O(total_loop_devices)), making +// it efficient for bulk cleanup at startup regardless of how many files are passed. +// A lazy unmount of each associated mountpoint is attempted before LOOP_CLR_FD so +// that zombie mounts left by a killed previous pod do not block the detach. +func CleanByFiles(backingFiles []string) { + want := make(map[string]bool, len(backingFiles)) + for _, f := range backingFiles { + abs, err := filepath.Abs(f) + if err != nil { + abs = f + } + want[abs] = true + } + + entries, _ := filepath.Glob("/sys/block/loop*/loop/backing_file") + for _, entry := range entries { + data, err := os.ReadFile(entry) + if err != nil { + continue + } + backing := strings.TrimSpace(string(data)) + backing = strings.TrimSuffix(backing, " (deleted)") + if want[backing] { + // Lazy-unmount the mountpoint before detaching. The NFS server pod + // shares the host mount namespace; zombie mounts from the previous + // pod keep the loop device in-use and cause LOOP_CLR_FD to fail. + mountpoint := strings.TrimSuffix(backing, ".quota") + _ = unix.Unmount(mountpoint, unix.MNT_DETACH) + parts := strings.Split(entry, "/") + _ = Detach("/dev/" + parts[3]) + } + } +} diff --git a/applications/nfsserver/nfsvol/internal/mount/exports.go b/applications/nfsserver/nfsvol/internal/mount/exports.go new file mode 100644 index 000000000..893fbb86a --- /dev/null +++ b/applications/nfsserver/nfsvol/internal/mount/exports.go @@ -0,0 +1,118 @@ +package mount + +import ( + "crypto/sha256" + "encoding/binary" + "fmt" + "log" + "os" + "os/exec" + "path/filepath" + "strings" +) + +// Directory scanned by rpc.mountd for fragment exports files. +const exportsConfigDir = "/etc/exports.d" + +// fsidFromPVName derives a stable uint32 NFS fsid from the given PV name. +// 0 and 1 are reserved by NFS (0 = NFSv4 pseudo-root, 1 = unused) so values +// in that range are bumped out of the reserved zone. +// +// Collision probability with SHA-256 → 32-bit is ~N²/2^33, which is +// ~2×10⁻⁵ at N=10⁴ and ~1×10⁻¹ at N=10⁶. If the provisioner ever needs to +// handle >10⁴ active PVCs, switch to NFSv4 `fsid=` syntax. +func fsidFromPVName(pvName string) uint32 { + h := sha256.Sum256([]byte(pvName)) + fsid := binary.BigEndian.Uint32(h[:4]) + if fsid < 2 { + fsid += 2 + } + return fsid +} + +// pvExportFile returns the /etc/exports.d/ fragment path for a given PV name. +func pvExportFile(pvName string) string { + return filepath.Join(exportsConfigDir, pvName+".exports") +} + +// WriteExport writes (or replaces) the exports fragment for a mountpoint with +// a deterministic fsid so client file handles remain valid across server +// restarts, pod reschedules, and watchdog remounts. +func WriteExport(mountpoint string) error { + pvName := filepath.Base(mountpoint) + fsid := fsidFromPVName(pvName) + + if err := os.MkdirAll(exportsConfigDir, 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", exportsConfigDir, err) + } + + contents := fmt.Sprintf( + "%s *(rw,fsid=%d,insecure,no_subtree_check,no_root_squash)\n", + mountpoint, fsid, + ) + + dst := pvExportFile(pvName) + tmp := dst + ".tmp" + if err := os.WriteFile(tmp, []byte(contents), 0644); err != nil { + return fmt.Errorf("write %s: %w", tmp, err) + } + if err := os.Rename(tmp, dst); err != nil { + _ = os.Remove(tmp) + return fmt.Errorf("rename %s → %s: %w", tmp, dst, err) + } + return nil +} + +// RemoveExport removes the exports fragment for a mountpoint. Idempotent. +func RemoveExport(mountpoint string) error { + pvName := filepath.Base(mountpoint) + if err := os.Remove(pvExportFile(pvName)); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +// ReloadExports has the kernel re-read /etc/exports and /etc/exports.d/. +// Safe to call whether or not rpc.mountd is running yet. +func ReloadExports() error { + cmd := exec.Command("exportfs", "-r") + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("exportfs -r: %w\n%s", err, out) + } + return nil +} + +// RegenerateExportsFromQuotas rewrites /etc/exports.d/ to match the set of +// *.quota files under exportsBase, removing fragments for volumes that no +// longer exist. Used by mount-all at startup to converge the exports table +// with reality — crucial after a node reschedule where the previous pod's +// exports.d may have stale entries (or none at all, on a fresh node). +func RegenerateExportsFromQuotas(exportsBase string) error { + quotaFiles, err := filepath.Glob(filepath.Join(exportsBase, "*.quota")) + if err != nil { + return err + } + + wanted := make(map[string]bool, len(quotaFiles)) + for _, qf := range quotaFiles { + mp := strings.TrimSuffix(qf, ".quota") + pvName := filepath.Base(mp) + wanted[pvExportFile(pvName)] = true + if err := WriteExport(mp); err != nil { + log.Printf("mount-all: write export for %s: %v", mp, err) + } + } + + existing, err := filepath.Glob(filepath.Join(exportsConfigDir, "*.exports")) + if err != nil { + return nil // directory may not exist yet; WriteExport would have created it + } + for _, e := range existing { + if !wanted[e] { + if err := os.Remove(e); err != nil { + log.Printf("mount-all: remove stale export %s: %v", e, err) + } + } + } + return nil +} diff --git a/applications/nfsserver/nfsvol/internal/mount/mount.go b/applications/nfsserver/nfsvol/internal/mount/mount.go new file mode 100644 index 000000000..86aba265f --- /dev/null +++ b/applications/nfsserver/nfsvol/internal/mount/mount.go @@ -0,0 +1,241 @@ +package mount + +import ( + "fmt" + "log" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "sync" + + "golang.org/x/sys/unix" + + "metacell/nfsvol/internal/loop" +) + +// IsMounted reports whether path is a mountpoint by comparing its device ID to +// its parent's. Returns false if path does not exist. +func IsMounted(path string) bool { + var st, stParent unix.Stat_t + if err := unix.Stat(path, &st); err != nil { + return false + } + if err := unix.Stat(filepath.Dir(path), &stParent); err != nil { + return false + } + return st.Dev != stParent.Dev +} + +// mountOne mounts the quota file for mountpoint. Idempotent: skips if already mounted. +func mountOne(mountpoint string) error { + quotaFile := mountpoint + ".quota" + + if IsMounted(mountpoint) { + return nil + } + + if err := os.MkdirAll(mountpoint, 0777); err != nil { + return fmt.Errorf("mkdir %s: %w", mountpoint, err) + } + + // Detach any stale loop device still associated with this backing file. + _ = loop.DetachByBacking(quotaFile) + + loopPath, err := loop.Attach(quotaFile) + if err != nil { + return fmt.Errorf("attach %s: %w", quotaFile, err) + } + + if err := unix.Mount(loopPath, mountpoint, "ext4", 0, ""); err != nil { + _ = loop.Detach(loopPath) + return fmt.Errorf("mount %s → %s: %w", loopPath, mountpoint, err) + } + + return os.Chmod(mountpoint, 0777) +} + +// MountAll mounts all *.quota files under exportsDir concurrently. +// It first cleans stale loop devices, then enumerates backing files and mounts +// each in a bounded worker pool. Logs individual failures but returns an aggregate +// error only if any mounts fail. +func MountAll(exportsDir string) error { + loop.CleanStale() + + quotaFiles, err := filepath.Glob(filepath.Join(exportsDir, "*.quota")) + if err != nil { + return err + } + if len(quotaFiles) == 0 { + log.Println("mount-all: no quota files found") + // Still regenerate /etc/exports.d/ so any stale fragments from a + // previous incarnation are cleaned up. + if err := RegenerateExportsFromQuotas(exportsDir); err != nil { + log.Printf("mount-all: regenerate exports: %v", err) + } + return nil + } + + // Detach ALL loop devices pointing to quota files in a single O(loop_count) + // pass. This clears stale devices accumulated from prior crash-loop restarts + // without the O(loop_count × quota_count) cost of per-file DetachByBacking. + log.Printf("mount-all: cleaning stale loop devices for %d quota files", len(quotaFiles)) + loop.CleanByFiles(quotaFiles) + + log.Printf("mount-all: mounting %d volumes", len(quotaFiles)) + + workers := runtime.NumCPU() * 2 + if workers > 32 { + workers = 32 + } + if workers < 1 { + workers = 1 + } + + type result struct { + path string + err error + } + + jobs := make(chan string, len(quotaFiles)) + results := make(chan result, len(quotaFiles)) + + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for qf := range jobs { + mp := strings.TrimSuffix(qf, ".quota") + err := mountOne(mp) + results <- result{mp, err} + } + }() + } + + for _, qf := range quotaFiles { + jobs <- qf + } + close(jobs) + + wg.Wait() + close(results) + + var failed int + for r := range results { + if r.err != nil { + log.Printf("mount-all: FAILED %s: %v", r.path, r.err) + failed++ + } else { + log.Printf("mount-all: OK %s", r.path) + } + } + // Regenerate exports.d deterministically from the quota files we just + // processed. This provides stable fsids even after a node reschedule. + // run_nfs.sh will call `exportfs -r` a few lines later, so no explicit + // reload needed here. + if err := RegenerateExportsFromQuotas(exportsDir); err != nil { + log.Printf("mount-all: regenerate exports: %v", err) + } + + if failed > 0 { + return fmt.Errorf("%d of %d mounts failed", failed, len(quotaFiles)) + } + return nil +} + +// Create makes a new quota-backed directory: creates the sparse backing file, +// formats it ext4, mounts it, and sets permissions. +func Create(mountpoint string, sizeBytes int64) error { + quotaFile := mountpoint + ".quota" + + // Clean up any pre-existing state left from a failed previous attempt. + _ = Delete(mountpoint) + + if err := createQuotaFile(quotaFile, sizeBytes); err != nil { + return err + } + + cmd := exec.Command("mkfs.ext4", "-F", quotaFile) + if out, err := cmd.CombinedOutput(); err != nil { + _ = os.Remove(quotaFile) + return fmt.Errorf("mkfs.ext4: %w\n%s", err, out) + } + + if err := mountOne(mountpoint); err != nil { + return err + } + if err := WriteExport(mountpoint); err != nil { + return fmt.Errorf("write export for %s: %w", mountpoint, err) + } + return ReloadExports() +} + +// Delete unmounts the loop-backed directory and renames the quota file to the +// mountpoint path (matching rmlimdir.sh behavior: caller decides final disposition). +func Delete(mountpoint string) error { + quotaFile := mountpoint + ".quota" + + if IsMounted(mountpoint) { + // MNT_DETACH = lazy unmount: detaches from the filesystem namespace + // immediately while still letting active users finish. + if err := unix.Unmount(mountpoint, unix.MNT_DETACH); err != nil { + log.Printf("delete: unmount %s: %v (continuing)", mountpoint, err) + } + } + + _ = loop.DetachByBacking(quotaFile) + _ = os.RemoveAll(mountpoint) + + // Rename quota file to the mountpoint path so the raw ext4 data is preserved + // as a regular file. The provisioner may then archive or remove it. + if _, err := os.Stat(quotaFile); err == nil { + if err := os.Rename(quotaFile, mountpoint); err != nil { + log.Printf("delete: rename %s → %s: %v", quotaFile, mountpoint, err) + } + } + + if err := RemoveExport(mountpoint); err != nil { + log.Printf("delete: remove export for %s: %v", mountpoint, err) + } + if err := ReloadExports(); err != nil { + log.Printf("delete: exportfs -r: %v", err) + } + return nil +} + +// RemountIfStale checks a single mountpoint and remounts it if stale. +// It skips paths whose backing quota file no longer exists. +func RemountIfStale(mountpoint string) error { + quotaFile := mountpoint + ".quota" + if _, err := os.Stat(quotaFile); err != nil { + return nil // quota file gone, nothing to do + } + if IsMounted(mountpoint) { + return nil + } + log.Printf("watchdog: stale mount at %s, remounting", mountpoint) + if err := mountOne(mountpoint); err != nil { + return err + } + // Re-assert the exports fragment — the fsid is deterministic from the + // PV name, so clients will keep using the same file handles; but we + // refresh the fragment in case /etc/exports.d/ lost it somehow. + if err := WriteExport(mountpoint); err != nil { + log.Printf("watchdog: write export for %s: %v", mountpoint, err) + } + return ReloadExports() +} + +func createQuotaFile(path string, size int64) error { + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("create %s: %w", path, err) + } + defer f.Close() + if err := f.Truncate(size); err != nil { + return fmt.Errorf("truncate %s to %d: %w", path, size, err) + } + return nil +} diff --git a/applications/nfsserver/nfsvol/internal/watchdog/watchdog.go b/applications/nfsserver/nfsvol/internal/watchdog/watchdog.go new file mode 100644 index 000000000..3e9858957 --- /dev/null +++ b/applications/nfsserver/nfsvol/internal/watchdog/watchdog.go @@ -0,0 +1,131 @@ +package watchdog + +import ( + "context" + "fmt" + "log" + "net/http" + "os" + "os/signal" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "syscall" + "time" + + "metacell/nfsvol/internal/mount" +) + +const checkWorkers = 32 + +// Run starts the mount health watchdog and HTTP endpoints. +// It blocks until SIGTERM or SIGINT is received. +// +// If mountFirst is true, MountAll is run synchronously before the watch +// loop starts. The HTTP server is up throughout so the liveness probe is +// satisfied immediately; the readiness probe (/ready) only returns 200 once +// MountAll completes. +func Run(exportsDir string, intervalSecs int, addr string, mountFirst bool) { + var ready atomic.Int32 // 0 = starting, 1 = ready + + mux := http.NewServeMux() + + // /healthz: liveness — always 200 while the process is running. + mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, "ok") + }) + + // /ready: readiness — 200 only after mount-all completes. + mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) { + if ready.Load() == 1 { + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, "ready") + } else { + w.WriteHeader(http.StatusServiceUnavailable) + fmt.Fprintln(w, "starting") + } + }) + + srv := &http.Server{Addr: addr, Handler: mux} + go func() { + log.Printf("watchdog: listening on %s", addr) + if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Printf("watchdog: http server: %v", err) + } + }() + + if mountFirst { + log.Printf("watchdog: running mount-all") + if err := mount.MountAll(exportsDir); err != nil { + log.Printf("watchdog: mount-all failed: %v", err) + } + } + ready.Store(1) + + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGTERM, syscall.SIGINT) + defer stop() + + ticker := time.NewTicker(time.Duration(intervalSecs) * time.Second) + defer ticker.Stop() + + log.Printf("watchdog: started, interval=%ds, exports=%s", intervalSecs, exportsDir) + + for { + select { + case <-ctx.Done(): + _ = srv.Shutdown(context.Background()) + return + case <-ticker.C: + checkAll(exportsDir) + } + } +} + +// checkAll verifies and repairs all expected mountpoints via a bounded worker pool. +func checkAll(exportsDir string) bool { + quotaFiles, err := filepath.Glob(filepath.Join(exportsDir, "*.quota")) + if err != nil || len(quotaFiles) == 0 { + return true + } + + type job struct{ mountpoint string } + jobs := make(chan job, len(quotaFiles)) + for _, qf := range quotaFiles { + jobs <- job{strings.TrimSuffix(qf, ".quota")} + } + close(jobs) + + var wg sync.WaitGroup + var failures atomic.Int32 + + workers := checkWorkers + if workers > len(quotaFiles) { + workers = len(quotaFiles) + } + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + if err := mount.RemountIfStale(j.mountpoint); err != nil { + log.Printf("watchdog: remount failed for %s: %v", j.mountpoint, err) + failures.Add(1) + } + } + }() + } + wg.Wait() + + if n := failures.Load(); n > 0 { + log.Printf("watchdog: %d/%d mounts still unhealthy", n, len(quotaFiles)) + return false + } + + if _, err := os.Stat(exportsDir); err != nil { + log.Printf("watchdog: exports dir inaccessible: %v", err) + return false + } + return true +} diff --git a/applications/nfsserver/nfsvol/main.go b/applications/nfsserver/nfsvol/main.go new file mode 100644 index 000000000..ee9220ef1 --- /dev/null +++ b/applications/nfsserver/nfsvol/main.go @@ -0,0 +1,90 @@ +package main + +import ( + "flag" + "fmt" + "log" + "os" + + "metacell/nfsvol/internal/mount" + "metacell/nfsvol/internal/watchdog" +) + +const exportsDir = "/exports" + +func main() { + log.SetFlags(log.Ltime | log.Lmicroseconds) + + if len(os.Args) < 2 { + usage() + os.Exit(1) + } + + switch os.Args[1] { + case "mount-all": + if err := mount.MountAll(exportsDir); err != nil { + log.Fatalf("mount-all: %v", err) + } + + case "create": + fs := flag.NewFlagSet("create", flag.ExitOnError) + mountpoint := fs.String("m", "", "mountpoint path (required)") + size := fs.Int64("s", 0, "size in bytes (required)") + _ = fs.Parse(os.Args[2:]) + if *mountpoint == "" || *size == 0 { + fmt.Fprintln(os.Stderr, "create: -m and -s are required") + os.Exit(1) + } + if err := mount.Create(*mountpoint, *size); err != nil { + log.Fatalf("create %s: %v", *mountpoint, err) + } + + case "delete": + fs := flag.NewFlagSet("delete", flag.ExitOnError) + mountpoint := fs.String("m", "", "mountpoint path (required)") + _ = fs.Parse(os.Args[2:]) + if *mountpoint == "" { + fmt.Fprintln(os.Stderr, "delete: -m is required") + os.Exit(1) + } + if err := mount.Delete(*mountpoint); err != nil { + log.Fatalf("delete %s: %v", *mountpoint, err) + } + + case "watchdog": + fs := flag.NewFlagSet("watchdog", flag.ExitOnError) + interval := fs.Int("interval", 30, "check interval in seconds") + addr := fs.String("addr", ":8080", "healthz listen address") + mountFirst := fs.Bool("mount-first", false, "run mount-all before starting the watch loop") + _ = fs.Parse(os.Args[2:]) + watchdog.Run(exportsDir, *interval, *addr, *mountFirst) + + default: + usage() + os.Exit(1) + } +} + +func usage() { + fmt.Fprintf(os.Stderr, `Usage: nfsvol [flags] + +Commands: + mount-all + Mount all *.quota files under /exports in parallel (bootstrap). + + create -m -s + Create and mount a new quota-backed directory. + + delete -m + Unmount and clean up a quota-backed directory. + + watchdog [-interval ] [-addr ] [-mount-first] + Long-running health monitor that remounts stale volumes and exposes + /healthz (liveness, always 200) and /ready (readiness, 200 once + mount-all completes) on the given address (default :8080). + -mount-first: run mount-all before starting the watch loop. Use this + in run_nfs.sh instead of a separate "nfsvol mount-all" call so the + HTTP server is live from the start and the liveness probe is always + satisfied during startup. +`) +} diff --git a/applications/nfsserver/resources/mklimdir.sh b/applications/nfsserver/resources/mklimdir.sh index a0381b49e..f9c099b25 100644 --- a/applications/nfsserver/resources/mklimdir.sh +++ b/applications/nfsserver/resources/mklimdir.sh @@ -1,115 +1,30 @@ #!/usr/bin/env bash -# Author: Zoran Sinnema -# Date: 14-11-2022 - - -# example of resizing a mounted pvc: -# dd if=/dev/zero bs=1MiB of=/export/ conv=notrunc oflag=append count= +# Thin wrapper — implementation moved to nfsvol create. +# Kept for backward compatibility with any external callers. +# +# Original flags: -m -s [--mountonly] +# nfsvol create does not support --mountonly (mount-all handles that path). set -e -print_usage(){ - -cat < -s - --m directory --s size in bytes --mo mount only --h this message - -Exit statuses: -0: -1: Invalid option -2: Missing argument -3: No args -4: root privillege required -EOF -} > /dev/stderr +mountpoint="" +size="" +mountonly=0 -parse_args(){ - options=$(getopt -l "h,m:,s:,mountonly" -o "hm:s:" -a -- "$@") - eval set -- "$options" - while true - do +while [[ $# -gt 0 ]]; do case "$1" in - -h) - print_usage; exit 0 ;; - -m) - shift; export mountpoint="$1" ;; - -s) - shift; export size="$1" ;; - --mountonly) - export mountonly=1 ;; - --) - shift - break;; + -m) mountpoint="$2"; shift 2 ;; + -s) size="$2"; shift 2 ;; + --mountonly) mountonly=1; shift ;; + --) shift; break ;; + *) shift ;; esac - shift - done -} - -unmount(){ - mountpoint=$1 - echo Unmount ${mountpoint} - for lodev in $(losetup -a|grep ${mountpoint}|awk '{print $1}'|cut -f 1 -d :); do - losetup -d ${lodev}|| true - done -} - -mkmount(){ - mountpoint=$1 - quota_fs=$2 - - unmount "${mountpoint}" - - echo Mounting ${quota_fs} on ${mountpoint} - i=$(losetup -f|cut -f 2 -d p) - lodev=/dev/loop${i} - - mknod -m666 ${lodev} b 7 ${i} 2>/dev/null || true - losetup -P ${lodev} ${quota_fs} - - rm -rf ${mountpoint} - mkdir ${mountpoint} - # chmod go+w ${mountpoint} - chmod 777 ${mountpoint} - mount ${lodev} ${mountpoint} - # chmod go+w ${mountpoint} - chmod 777 ${mountpoint} -} - -mklimfile(){ - quota_fs=$1 - size=$2 - - count_mb=$(( ${size}/1024/1024 + 1)) - - echo Create file ${quota_fs} size ${count_mb}Mb - truncate -s ${size} ${quota_fs} - #dd if=/dev/zero of=${quota_fs} bs=1M count=${count_mb} - yes | mkfs.ext4 ${quota_fs} -} - -main(){ - if [ $EUID -ne 0 ]; then - echo ">>> Please run the script with sudo/as root" > /dev/stderr - exit 4 - fi - - local mountpoint="" - local size=0 - local mountonly= - - parse_args "$@" - quota_fs=${mountpoint}.quota +done - if [ -z ${mountonly} ]; then - # cleanup before creating the folder and quota file - bash -c "rmlimdir.sh -m ${mountpoint}" || true - mklimfile "${quota_fs}" "${size}" - fi - mkmount "${mountpoint}" "${quota_fs}" -} +if [[ $mountonly -eq 1 ]]; then + # mount-only path: let mount-all handle this, but support single-path invocation + # by running nfsvol mount-all and letting it skip already-mounted volumes. + exec /usr/local/bin/nfsvol mount-all +fi -main "$@" +exec /usr/local/bin/nfsvol create -m "${mountpoint}" -s "${size}" diff --git a/applications/nfsserver/resources/pre-startup.sh b/applications/nfsserver/resources/pre-startup.sh index c83b4534d..26517add6 100644 --- a/applications/nfsserver/resources/pre-startup.sh +++ b/applications/nfsserver/resources/pre-startup.sh @@ -1,3 +1,4 @@ #!/bin/bash - -source /usr/local/bin/remount.sh +# Thin wrapper — kept for backward compatibility. Bootstrap is now handled +# by run_nfs.sh calling nfsvol mount-all directly. +exec /usr/local/bin/nfsvol mount-all diff --git a/applications/nfsserver/resources/remount.sh b/applications/nfsserver/resources/remount.sh index 19dfbd104..ea9c0922c 100644 --- a/applications/nfsserver/resources/remount.sh +++ b/applications/nfsserver/resources/remount.sh @@ -1,14 +1,4 @@ #!/bin/bash - -# remount -losetup -D -for lodev in `losetup -a|grep deleted|awk '{print $1}'|cut -f 1 -d :` -do - losetup -d ${lodev} -done - -for qf in `ls /exports/*.quota` -do - mountpoint=${qf%.*} - mklimdir.sh -m ${mountpoint} --mountonly -done +# Thin wrapper — implementation moved to nfsvol mount-all. +# Kept for backward compatibility with any external callers. +exec /usr/local/bin/nfsvol mount-all diff --git a/applications/nfsserver/resources/rmlimdir.sh b/applications/nfsserver/resources/rmlimdir.sh index 22749c5b6..379e812fd 100644 --- a/applications/nfsserver/resources/rmlimdir.sh +++ b/applications/nfsserver/resources/rmlimdir.sh @@ -1,69 +1,17 @@ #!/usr/bin/env bash -# Author: Zoran Sinnema -# Date: 14-11-2022 +# Thin wrapper — implementation moved to nfsvol delete. +# Kept for backward compatibility with any external callers. -print_usage(){ +set -e -cat < +mountpoint="" --m directory --h this message +while [[ $# -gt 0 ]]; do + case "$1" in + -m) mountpoint="$2"; shift 2 ;; + --) shift; break ;; + *) shift ;; + esac +done -Exit statuses: -0: -1: Invalid option -2: Missing argument -3: No args -4: root privillege required -EOF -} > /dev/stderr - -parse_args(){ - set -x - - option_handler(){ - - case ${opt} in - m) mountpoint=${OPTARG} ;; - h) print_usage; exit 0 ;; - \?) echo ">>>Invalid option: -$OPTARG" > /dev/stderr; exit 1;; - \:) echo ">>>Missing argument to -${OPTARG}" > /dev/stderr; exit 2;; - esac - } - - local OPTIND opt - getopts "m:s:f:h" opt || { echo "No args passed">/dev/stderr;print_usage;exit 3;} - option_handler - while getopts "m:h" opt; do - option_handler - done - shift $((OPTIND-1)) - -} - - -main(){ - if [ $EUID -ne 0 ]; then - echo ">>> Please run the script with sudo/as root" > /dev/stderr - exit 4 - fi - - local mountpoint="" - - parse_args "$@" - quota_fs=${mountpoint}.quota - mountname=`basename ${mountpoint}` - - # find the loop back device and delete/unmount it - lodev=`losetup -a | grep ${mountname} | awk '{print $1}' | cut -f 1 -d :` || true - - losetup -d ${lodev} || true - umount -df ${mountpoint} || true - - rm -rf ${mountpoint} || true - mv ${quota_fs} ${mountpoint} || true - -} - -main "$@" +exec /usr/local/bin/nfsvol delete -m "${mountpoint}" diff --git a/applications/nfsserver/resources/run_nfs.sh b/applications/nfsserver/resources/run_nfs.sh index d884b2dd3..84b2f3c2c 100644 --- a/applications/nfsserver/resources/run_nfs.sh +++ b/applications/nfsserver/resources/run_nfs.sh @@ -16,8 +16,11 @@ function start() { - # run pre startup script - bash -c "/usr/local/bin/pre-startup.sh" + # Start the HTTP liveness server immediately, then run mount-all inside the + # watchdog process. /healthz returns 200 at once (liveness probe satisfied); + # /ready returns 200 only once mount-all finishes (readiness gate). + /usr/local/bin/nfsvol watchdog -mount-first & + bash -c "/usr/local/bin/start_provisioner.sh&" unset gid @@ -36,14 +39,17 @@ function start() /usr/sbin/rpcbind -w fi - mount -t nfsd nfds /proc/fs/nfsd + mount -t nfsd nfsd /proc/fs/nfsd - # -V 3: enable NFSv3 - /usr/sbin/rpc.mountd -N 2 -V 3 -V 4 + # rpc.mountd: no -V flag — let it register all mount protocol versions (1,2,3). + # Adding -V 3 here incorrectly restricts registration to version 1 only, + # which causes "Permission denied" on NFSv3 client mounts. + /usr/sbin/rpc.mountd /usr/sbin/exportfs -r - # -G 10 to reduce grace time to 10 seconds (the lowest allowed) - /usr/sbin/rpc.nfsd -G 10 -N 2 -V 3 -V 4 + # -G 10 to reduce grace time to 10 seconds (the lowest allowed). + # -V 3: enable NFSv3 (matches client mount options). + /usr/sbin/rpc.nfsd -G 10 -V 3 /usr/sbin/rpc.statd --no-notify echo "NFS started" } @@ -56,13 +62,33 @@ function stop() /usr/sbin/exportfs -au /usr/sbin/exportfs -f - kill $( pidof rpc.mountd ) + kill $( pidof rpc.mountd ) 2>/dev/null || true umount /proc/fs/nfsd + + # Lazy-unmount all loop-backed exports before exiting. The pod shares the + # host mount namespace, so any mount left alive here persists after the + # container dies. The next pod's LOOP_CLR_FD then fails with EBUSY and + # cannot reuse the loop device. + for mp in /exports/*/; do + umount -l "$mp" 2>/dev/null || true + done + echo > /etc/exports exit 0 } +# rpc.statd has issues with very high ulimits +ulimit -n 65535 + +# Each loop device creates inotify watches inside the container. On deployments +# with thousands of PVCs the kernel default (8192-12288) is exhausted, which +# causes rpc.mountd to fail with "No space left on device". Write directly to +# /proc/sys rather than using sysctl(8), which is not installed in this image. +# The pod is privileged so the write is permitted. +echo 1048576 > /proc/sys/fs/inotify/max_user_watches 2>/dev/null || true +echo 8192 > /proc/sys/fs/inotify/max_user_instances 2>/dev/null || true + trap stop TERM start "$@" diff --git a/applications/nfsserver/test/failover-test.sh b/applications/nfsserver/test/failover-test.sh new file mode 100755 index 000000000..25c290a6b --- /dev/null +++ b/applications/nfsserver/test/failover-test.sh @@ -0,0 +1,435 @@ +#!/usr/bin/env bash +# +# Production failover / edge-case test for the cloud-harness NFS server. +# +# Scenarios covered +# ----------------- +# +# 1. RWX PVC provisioning +# Create a ReadWriteMany PVC against the -nfs-client storage +# class and wait for Bound. Exercises the provisioner end-to-end: loopback +# ext4 creation, mount, and per-PV entry in /etc/exports.d/. +# +# 2. Multi-node mount via pod anti-affinity +# Spin up two writer pods with `podAntiAffinity` on +# `topologyKey: kubernetes.io/hostname` so they land on different nodes, +# both mounting the same PVC. Both writers are also kept off the nfs-server +# node: in GKE the containerized mounter runs from the host network +# namespace and conflicts with the server's rpcbind on port 111, causing +# the mount to time out when client and server share a node. +# Proves the NFS server accepts simultaneous mounts from two distinct hosts. +# +# 3. Cross-node read/write visibility +# Each pod writes a distinct file; the peer reads it back after a short +# attribute-cache delay. Proves cache coherency across the ClusterIP +# service hop. +# +# 4. fsid stability across NFS server pod restart ***critical*** +# Delete the running nfs-server pod while clients have live mounts, +# wait for Recreate to schedule a new pod, then verify clients can still +# read pre-restart data without remounting. Proves the `Change A` +# stable-fsid design (/etc/exports.d/.exports with SHA-256-derived +# fsid) prevents ESTALE on the server-side failover path. Also writes a +# new file post-restart and checks it propagates. +# +# 5. Watchdog recovery from stale loop device +# Inside the server pod, forcibly `losetup -d` the loop device backing +# the test PVC. Waits ~45 s and verifies the watchdog (30 s interval) +# has remounted and clients still see the data. Proves intra-pod +# stale-mount recovery. +# +# 6. Concurrent writes from both nodes (documented race tolerance) +# Both pods `echo >> /data/concurrent.log` in parallel. Under +# `nolock,local_lock=all` soft semantics interleaving is accepted but no +# write should disappear entirely; the check asserts ≥50 lines land. +# Documents the tradeoff, it does not prevent the race. +# +# 7. PVC delete cleanup +# Delete the PVC and verify the /etc/exports.d/.exports fragment is +# removed on the server. Proves the provisioner's Delete path correctly +# invokes `nfsvol delete` (which removes the fragment and runs +# `exportfs -r`). +# +# What is NOT covered +# ------------------- +# +# - Ungraceful node loss (the ~6 min force-detach path): impossible to +# simulate reliably in CI without kicking a real node out of the cluster. +# Documented in README-PROD.md as an inherent limitation. +# - Backup / disaster recovery: operator responsibility. +# - HA / active-passive: out of scope for this architecture. +# +# Requirements +# ------------ +# +# - kubectl on PATH, authenticated to the test cluster +# - at least 3 schedulable nodes (test skips gracefully otherwise; 1 for +# nfs-server, 2 for writer pods which must not share the server's node) +# - NAMESPACE env var pointing at the deployed namespace +# - STORAGE_CLASS env var (defaults to "-nfs-client") +# +# The script cleans up all fixtures on EXIT (including on failure) via trap. +# +# Example: +# NAMESPACE=test-ch bash ./failover-test.sh 2>&1 + +set -u -o pipefail + +NAMESPACE="${NAMESPACE:?NAMESPACE env var is required}" +STORAGE_CLASS="${STORAGE_CLASS:-${NAMESPACE}-nfs-client}" +PREFIX="nfs-failover-test" +PVC="${PREFIX}-pvc" +WRITER_A="${PREFIX}-writer-a" +WRITER_B="${PREFIX}-writer-b" +WRITER_C="${PREFIX}-writer-c" + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } +pass() { printf '[%s] PASS: %s\n' "$(date +%H:%M:%S)" "$*" >&2; } +fail() { printf '[%s] FAIL: %s\n' "$(date +%H:%M:%S)" "$*" >&2; exit 1; } + +k() { kubectl -n "$NAMESPACE" "$@"; } + +cleanup() { + log "cleanup: removing test fixtures" + k delete pod "$WRITER_A" "$WRITER_B" "$WRITER_C" --ignore-not-found --grace-period=0 --force --wait=false 2>/dev/null || true + k delete pvc "$PVC" --ignore-not-found --wait=false 2>/dev/null || true +} +trap cleanup EXIT + +wait_for() { + # wait_for ; returns 0 on success, 1 on timeout + local timeout=$1; shift + local end=$(( $(date +%s) + timeout )) + while [ "$(date +%s)" -lt "$end" ]; do + if "$@" >/dev/null 2>&1; then return 0; fi + sleep 2 + done + return 1 +} + +# ----------------------------------------------------------------------------- +log "preflight" +# ----------------------------------------------------------------------------- + +NODE_COUNT=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') +if [ "${NODE_COUNT:-0}" -lt 3 ]; then + log "SKIP: need ≥3 nodes, found ${NODE_COUNT}" + exit 0 +fi +log "preflight: $NODE_COUNT nodes available" + +if ! k get storageclass "$STORAGE_CLASS" >/dev/null 2>&1 && \ + ! kubectl get storageclass "$STORAGE_CLASS" >/dev/null 2>&1; then + fail "storage class $STORAGE_CLASS not found" +fi + +k rollout status deploy/nfs-server --timeout=180s || fail "nfs-server not Ready" +INITIAL_NFS_POD=$(k get pod -l app=nfs-server -o jsonpath='{.items[0].metadata.name}') +log "initial nfs-server pod: $INITIAL_NFS_POD" + +# ----------------------------------------------------------------------------- +log "test 1: provision RWX PVC via $STORAGE_CLASS" +# ----------------------------------------------------------------------------- + +cat </dev/null)\" = Bound ]" \ + || fail "PVC did not Bind within 60s" +PV_NAME=$(k get pvc "$PVC" -o jsonpath='{.spec.volumeName}') +pass "PVC bound to $PV_NAME" + +# ----------------------------------------------------------------------------- +log "test 2: mount same PVC on two nodes via pod anti-affinity" +# ----------------------------------------------------------------------------- + +for pod in "$WRITER_A" "$WRITER_B"; do + cat < /data/from-a.txt && sync" || fail "write from A failed" +k exec "$WRITER_B" -- sh -c "echo hello-from-B > /data/from-b.txt && sync" || fail "write from B failed" +sleep 2 # NFS attribute cache + +got=$(k exec "$WRITER_B" -- cat /data/from-a.txt 2>/dev/null || echo MISSING) +[ "$got" = "hello-from-A" ] || fail "B cannot read A's file: got '$got'" + +got=$(k exec "$WRITER_A" -- cat /data/from-b.txt 2>/dev/null || echo MISSING) +[ "$got" = "hello-from-B" ] || fail "A cannot read B's file: got '$got'" +pass "cross-node r/w works" + +# ----------------------------------------------------------------------------- +log "test 4: fsid stability across nfs-server restart" +# ----------------------------------------------------------------------------- + +log "deleting nfs-server pod $INITIAL_NFS_POD" +k delete pod "$INITIAL_NFS_POD" --wait=false + +wait_for 180 sh -c " + cur=\$(kubectl -n $NAMESPACE get pod -l app=nfs-server -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + [ -n \"\$cur\" ] && [ \"\$cur\" != '$INITIAL_NFS_POD' ] && + kubectl -n $NAMESPACE get pod \$cur -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' 2>/dev/null | grep -q True +" || fail "new nfs-server pod did not become Ready in 180s" + +NEW_NFS_POD=$(k get pod -l app=nfs-server -o jsonpath='{.items[0].metadata.name}') +log "new nfs-server pod: $NEW_NFS_POD" + +# After Ready, give NFS userland (rpcbind, rpc.nfsd, exportfs) a moment to be +# fully serving — the readiness probe only checks the watchdog's healthz. +sleep 15 + +# Under stable-fsid clients keep reading the same file handles; with unstable +# fsids the read after restart would return ESTALE permanently. +# +# Each attempt: ls to force the NFS client to revalidate the directory handle +# (which also clears any cached stale child handles), then cat to read. +# Both calls are time-bounded so a hard-mount hang during the recovery +# window (~30 s) does not stall the loop. +verify_read() { + local pod=$1 file=$2 expected=$3 + local dir + dir=$(dirname "$file") + local attempt=0 + local got + while [ $attempt -lt 30 ]; do + timeout 10 kubectl -n "$NAMESPACE" exec "$pod" -- ls "$dir" >/dev/null 2>&1 || true + got=$(timeout 15 kubectl -n "$NAMESPACE" exec "$pod" -- cat "$file" 2>/dev/null || true) + if [ "$got" = "$expected" ]; then + return 0 + fi + sleep 3 + attempt=$((attempt + 1)) + done + return 1 +} + +verify_read "$WRITER_A" /data/from-a.txt hello-from-A || fail "A lost its own file (ESTALE — fsid unstable)" +verify_read "$WRITER_B" /data/from-b.txt hello-from-B || fail "B lost its own file (ESTALE — fsid unstable)" +verify_read "$WRITER_B" /data/from-a.txt hello-from-A || fail "B lost A's file after restart (ESTALE)" + +# Verify writes still propagate post-restart +k exec "$WRITER_A" -- sh -c "echo post-restart > /data/post-restart.txt && sync" || fail "post-restart write failed" +verify_read "$WRITER_B" /data/post-restart.txt post-restart || fail "post-restart write did not propagate" +pass "fsid stable across server pod restart -- no ESTALE" + +# ----------------------------------------------------------------------------- +log "test 4b: inotify watch limit raised in server pod" +# ----------------------------------------------------------------------------- +# Each loop device and NFS client tracked by rpc.mountd consumes inotify +# watches. The kernel default (~8192-12288) is exhausted on clusters with +# thousands of PVCs, causing rpc.mountd to fail with ENOSPC. run_nfs.sh must +# raise this via sysctl before starting mountd. + +watches=$(k exec "$NEW_NFS_POD" -- cat /proc/sys/fs/inotify/max_user_watches 2>/dev/null | tr -d '[:space:]' || echo 0) +if [ "${watches:-0}" -lt 524288 ]; then + fail "inotify max_user_watches=$watches is too low -- rpc.mountd will exhaust watches on large clusters (need >=524288)" +fi +pass "inotify max_user_watches=$watches (>=524288)" + +# ----------------------------------------------------------------------------- +log "test 4c: rpc.mountd running and MOUNT protocol v1/v2/v3 all registered" +# ----------------------------------------------------------------------------- +# Passing '-V 3' to rpc.mountd on this kernel incorrectly restricts MOUNT +# protocol registration to version 1 only, causing NFSv3 clients to receive +# "Permission denied" when mounting. Verify all three versions are present. + +k exec "$NEW_NFS_POD" -- sh -c \ + "ls /proc/[0-9]*/exe 2>/dev/null | xargs readlink 2>/dev/null | grep -q 'rpc\.mountd'" \ + || fail "rpc.mountd is not running in $NEW_NFS_POD (check run_nfs.sh / inotify limit)" +pass "rpc.mountd process is running" + +for ver in 1 2 3; do + k exec "$NEW_NFS_POD" -- sh -c \ + "/usr/sbin/rpcinfo -p 2>/dev/null | awk -v v=$ver '\$1==100005 && \$2==v' | grep -q ." \ + || fail "MOUNT protocol version $ver not registered -- run rpc.mountd without -V flag" +done +pass "MOUNT protocol versions 1, 2, 3 all registered in portmapper" + +# ----------------------------------------------------------------------------- +log "test 4d: loop device count bounded after restart (CleanByFiles coverage)" +# ----------------------------------------------------------------------------- +# After crash-loop restarts the kernel retains loop devices whose backing files +# are still present (not marked deleted). CleanByFiles in mount-all must detach +# them in a single O(loop_count) pass before remounting. Verify the residual +# count is <= number of quota files plus a small tolerance. + +quota_count=$(k exec "$NEW_NFS_POD" -- sh -c 'ls /exports/*.quota 2>/dev/null | wc -l' | tr -d '[:space:]' || echo 0) +loop_count=$(k exec "$NEW_NFS_POD" -- sh -c 'losetup -a 2>/dev/null | wc -l' | tr -d '[:space:]' || echo 0) +tolerance=5 +max_allowed=$(( quota_count + tolerance )) +if [ "${loop_count:-0}" -gt "$max_allowed" ]; then + fail "loop device count $loop_count exceeds quota count $quota_count + tolerance $tolerance -- stale cleanup (CleanByFiles) may not have run" +fi +pass "loop device count $loop_count <= quota count $quota_count + $tolerance (no stale accumulation)" + +# ----------------------------------------------------------------------------- +log "test 4e: new client mount after server restart (rpc.mountd availability)" +# ----------------------------------------------------------------------------- +# Spin up a fresh pod AFTER the server has restarted. This exercises the +# MOUNT protocol path (new mount negotiation) rather than just file-handle +# reuse by existing clients. Catches the case where rpc.mountd is not +# running or not accepting new connections post-restart. + +cat < where pvName includes the PV_NAME. +MOUNTPOINT=$(k exec "$NEW_NFS_POD" -- sh -c "ls -d /exports/*${PV_NAME}* 2>/dev/null | grep -v '\.exports$' | grep -v 'archived-' | head -1" || true) +if [ -z "$MOUNTPOINT" ]; then + log "skipping test 5: could not locate server-side mountpoint for $PV_NAME" +else + LOOPDEV=$(k exec "$NEW_NFS_POD" -- sh -c "losetup -a | grep '$MOUNTPOINT.quota' | cut -d: -f1" || true) + if [ -z "$LOOPDEV" ]; then + log "skipping test 5: could not locate loop device for $MOUNTPOINT" + else + log "forcing stale: detach $LOOPDEV (backing $MOUNTPOINT.quota)" + k exec "$NEW_NFS_POD" -- sh -c "losetup -d $LOOPDEV" || true + # Watchdog interval is 30s; allow a margin. + sleep 45 + verify_read "$WRITER_A" /data/from-a.txt hello-from-A \ + || fail "watchdog did not recover from stale loop within 45s" + pass "watchdog recovered stale loop device and clients kept access" + fi +fi + +# ----------------------------------------------------------------------------- +log "test 6: concurrent writes from both nodes (documented race tolerance)" +# ----------------------------------------------------------------------------- + +k exec "$WRITER_A" -- sh -c 'for i in $(seq 1 50); do echo "A-$i" >> /data/concurrent.log; done' & +pidA=$! +k exec "$WRITER_B" -- sh -c 'for i in $(seq 1 50); do echo "B-$i" >> /data/concurrent.log; done' & +pidB=$! +wait $pidA $pidB + +count=$(k exec "$WRITER_A" -- sh -c 'wc -l < /data/concurrent.log' | tr -d ' ') +# With local_lock=all on soft NFSv3, interleaving is allowed but no write should +# be completely lost. Expect at least 50 lines (weakest guarantee). +if [ "${count:-0}" -lt 50 ]; then + fail "concurrent writes lost more than half: $count < 50" +fi +pass "concurrent writes produced $count lines (interleaving tolerated)" + +# ----------------------------------------------------------------------------- +log "test 7: PVC delete cleans up /etc/exports.d/ fragment" +# ----------------------------------------------------------------------------- + +k delete pod "$WRITER_A" "$WRITER_B" "$WRITER_C" --ignore-not-found --grace-period=10 --wait=true + +NFS_POD=$(k get pod -l app=nfs-server -o jsonpath='{.items[0].metadata.name}') + +# The fragment file is named .exports where pvName is the +# mountpoint basename (which contains $PV_NAME inside it). Find it by listing. +FRAGMENT=$(k exec "$NFS_POD" -- sh -c "ls /etc/exports.d/ 2>/dev/null" | grep -F "$PV_NAME" | head -1 || true) + +if [ -z "$FRAGMENT" ]; then + log "WARN: no exports fragment for $PV_NAME before PVC delete — skipping test 7" +else + log "fragment found: $FRAGMENT — deleting PVC and waiting for it to disappear" + k delete pvc "$PVC" --wait=true + # Provisioner delete is async; allow a brief settling window. + fragment_gone() { + ! k exec "$NFS_POD" -- test -f "/etc/exports.d/$FRAGMENT" + } + wait_for 30 fragment_gone \ + || fail "exports fragment $FRAGMENT not removed after PVC delete" + pass "exports fragment $FRAGMENT removed on PVC delete" +fi + +log "ALL TESTS PASSED" diff --git a/applications/samples/deploy/values-minimal.yaml b/applications/samples/deploy/values-minimal.yaml index c6fb239a8..58d06b3e4 100644 --- a/applications/samples/deploy/values-minimal.yaml +++ b/applications/samples/deploy/values-minimal.yaml @@ -1,8 +1,8 @@ harness: secured: false dependencies: - soft: [] - hard: [] + soft: + - common use_services: [] dockerfile: buildArgs: diff --git a/applications/samples/deploy/values-prod.yaml b/applications/samples/deploy/values-prod.yaml index 6a705ff5b..0fa8cc770 100644 --- a/applications/samples/deploy/values-prod.yaml +++ b/applications/samples/deploy/values-prod.yaml @@ -2,4 +2,4 @@ harness: deployment: resources: requests: - memory: "33Mi" \ No newline at end of file + memory: "33Mi" diff --git a/applications/samples/deploy/values-test.yaml b/applications/samples/deploy/values-test.yaml index 606842a0e..0b5e5107b 100644 --- a/applications/samples/deploy/values-test.yaml +++ b/applications/samples/deploy/values-test.yaml @@ -6,6 +6,7 @@ harness: - common - jupyterhub - volumemanager + hard: - nfsserver build: - cloudharness-flask @@ -30,4 +31,6 @@ harness: - role1 realmRoles: - offline_access - + deployment: + volume: + usenfs: true diff --git a/deployment-configuration/codefresh-template-test.yaml b/deployment-configuration/codefresh-template-test.yaml index 8d38dcc60..9099b6bcb 100644 --- a/deployment-configuration/codefresh-template-test.yaml +++ b/deployment-configuration/codefresh-template-test.yaml @@ -76,7 +76,7 @@ steps: kube_context: ${{CLUSTER_NAME}} namespace: test-${{NAMESPACE_BASENAME}} chart_version: ${{CF_SHORT_REVISION}} - cmd_ps: --timeout 600s --create-namespace + cmd_ps: "--timeout 600s --create-namespace" custom_value_files: - ./deployment/helm/values.yaml build_test_images: @@ -91,6 +91,35 @@ steps: commands: - kubectl config use-context ${{CLUSTER_NAME}} - kubectl config set-context --current --namespace=test-${{NAMESPACE_BASENAME}} + tests_nfs_failover: + stage: qa + title: NFS server failover & multi-node tests + image: codefresh/kubectl + working_directory: . + fail_fast: false + commands: + - kubectl config use-context ${{CLUSTER_NAME}} + - export NAMESPACE=test-${{NAMESPACE_BASENAME}} + - export STORAGE_CLASS=${NAMESPACE}-nfs-client + - bash applications/nfsserver/test/failover-test.sh + hooks: + on_finish: + exec: + image: codefresh/kubectl + commands: + - kubectl config use-context ${{CLUSTER_NAME}} + - export NAMESPACE=test-${{NAMESPACE_BASENAME}} + - kubectl -n $NAMESPACE delete pod -l app=nfs-failover-test --ignore-not-found --grace-period=0 --force --wait=false 2>/dev/null || true + - kubectl -n $NAMESPACE delete pvc -l app=nfs-failover-test --ignore-not-found --wait=false 2>/dev/null || true + on_fail: + exec: + image: alpine + commands: + - cf_export FAILED=failed + when: + condition: + all: + nfs_test_enabled: '"${{TEST_NFS_FAILOVER}}" == "true"' tests_api: stage: qa title: Api tests diff --git a/deployment/codefresh-test.yaml b/deployment/codefresh-test.yaml index 91ab4f2c8..21917b7c8 100644 --- a/deployment/codefresh-test.yaml +++ b/deployment/codefresh-test.yaml @@ -13,26 +13,13 @@ steps: repo: ${{CF_REPO_OWNER}}/${{CF_REPO_NAME}} revision: ${{CF_BRANCH}} git: github - post_main_clone: - title: Post main clone - type: parallel - stage: prepare - steps: - clone_cloud_harness: - title: Cloning cloud-harness repository... - type: git-clone - stage: prepare - repo: https://github.com/MetaCell/cloud-harness.git - revision: ${{CLOUDHARNESS_BRANCH}} - working_directory: . - git: github prepare_deployment: title: Prepare helm chart image: python:3.12 stage: prepare working_directory: . commands: - - bash cloud-harness/install.sh + - bash ./install.sh - export HELM_NAME_ARG="$( [ -n "${{CHART_NAME}}" ] && printf -- "--name %s" "${{CHART_NAME}}" )" - export HELM_CHART_VERSION_ARG="$( [ -n "${{CHART_VERSION}}" ] && printf -- "--chart-version @@ -126,6 +113,28 @@ steps: '{{CLOUDHARNESS_FRONTEND_BUILD_TAG_EXISTS}}') == true forceNoCache: includes('${{CLOUDHARNESS_FRONTEND_BUILD_TAG_FORCE_BUILD}}', '{{CLOUDHARNESS_FRONTEND_BUILD_TAG_FORCE_BUILD}}') == false + nfsserver: + type: build + stage: build + dockerfile: Dockerfile + registry: ${{CODEFRESH_REGISTRY}} + buildkit: true + build_arguments: + - NOCACHE=${{CF_BUILD_ID}} + image_name: cloud-harness/nfsserver + title: Nfsserver + working_directory: ./applications/nfsserver + tags: + - ${{NFSSERVER_TAG}} + - ${{DEPLOYMENT_PUBLISH_TAG}}-dev + - ${{CF_BRANCH_TAG_NORMALIZED_LOWER_CASE}} + when: + condition: + any: + buildDoesNotExist: includes('${{NFSSERVER_TAG_EXISTS}}', '{{NFSSERVER_TAG_EXISTS}}') + == true + forceNoCache: includes('${{NFSSERVER_TAG_FORCE_BUILD}}', '{{NFSSERVER_TAG_FORCE_BUILD}}') + == false test-e2e: type: build stage: build @@ -524,12 +533,37 @@ steps: - kubectl config set-context --current --namespace=test-${{NAMESPACE_BASENAME}} - kubectl rollout status deployment/workflows - kubectl rollout status deployment/accounts - - kubectl rollout status deployment/samples - - kubectl rollout status deployment/samples-gk - kubectl rollout status deployment/volumemanager + - kubectl rollout status deployment/samples - kubectl rollout status deployment/common - - kubectl rollout status deployment/argo-gk - sleep 60 + tests_nfs_failover: + stage: qa + title: NFS server failover & multi-node tests + image: codefresh/kubectl + working_directory: . + fail_fast: false + commands: + - kubectl config use-context ${{CLUSTER_NAME}} + - export NAMESPACE=test-${{NAMESPACE_BASENAME}} + - export STORAGE_CLASS=${NAMESPACE}-nfs-client + - bash applications/nfsserver/test/failover-test.sh + hooks: + on_finish: + exec: + image: codefresh/kubectl + commands: + - kubectl config use-context ${{CLUSTER_NAME}} + - export NAMESPACE=test-${{NAMESPACE_BASENAME}} + - kubectl -n $NAMESPACE delete pod -l app=nfs-failover-test --ignore-not-found + --grace-period=0 --force --wait=false 2>/dev/null || true + - kubectl -n $NAMESPACE delete pvc -l app=nfs-failover-test --ignore-not-found + --wait=false 2>/dev/null || true + on_fail: + exec: + image: alpine + commands: + - cf_export FAILED=failed tests_api: stage: qa title: Api tests @@ -549,6 +583,16 @@ steps: commands: - st --pre-run cloudharness_test.apitest_init run api/openapi.yaml --base-url https://workflows.${{DOMAIN}}/api -c all + common_api_test: + title: common api test + volumes: + - ${{CF_REPO_NAME}}/applications/common:/home/test + - ${{CF_REPO_NAME}}/deployment/helm/values.yaml:/opt/cloudharness/resources/allvalues.yaml + environment: + - APP_URL=https://common.${{DOMAIN}}/api + commands: + - st --pre-run cloudharness_test.apitest_init run api/openapi.yaml --base-url + https://common.${{DOMAIN}}/api -c all samples_api_test: title: samples api test volumes: @@ -565,16 +609,6 @@ steps: --hypothesis-suppress-health-check=too_slow --hypothesis-deadline=180000 --request-timeout=180000 --hypothesis-max-examples=2 --show-trace --exclude-checks=ignored_auth - pytest -v test/api - common_api_test: - title: common api test - volumes: - - ${{CF_REPO_NAME}}/applications/common:/home/test - - ${{CF_REPO_NAME}}/deployment/helm/values.yaml:/opt/cloudharness/resources/allvalues.yaml - environment: - - APP_URL=https://common.${{DOMAIN}}/api - commands: - - st --pre-run cloudharness_test.apitest_init run api/openapi.yaml --base-url - https://common.${{DOMAIN}}/api -c all hooks: on_fail: exec: diff --git a/libraries/cloudharness-common/cloudharness/utils/env.py b/libraries/cloudharness-common/cloudharness/utils/env.py index cf3d47b8d..bc06fd4ca 100644 --- a/libraries/cloudharness-common/cloudharness/utils/env.py +++ b/libraries/cloudharness-common/cloudharness/utils/env.py @@ -84,7 +84,7 @@ def get_cloudharness_events_client_id(): def get_cloudharness_events_service(): - return get_service_cluster_address('BOOTSTRAP') + return get_service_cluster_address('bootstrap') def get_service_cluster_address(cloudharness_app_name): diff --git a/tools/deployment-cli-tools/ch_cli_tools/codefresh.py b/tools/deployment-cli-tools/ch_cli_tools/codefresh.py index 5a89ec40b..412167f8f 100644 --- a/tools/deployment-cli-tools/ch_cli_tools/codefresh.py +++ b/tools/deployment-cli-tools/ch_cli_tools/codefresh.py @@ -22,16 +22,26 @@ ROLLOUT_CMD_TPL = "kubectl rollout status deployment/%s" -def _to_codefresh_path(rel_path: str) -> str: - """Rewrite a relative path that escapes the current directory to use ./cloud-harness. +def _to_codefresh_path(path: str) -> str: + """Return the Codefresh-friendly path for any path pointing into the cloud-harness tree. In Codefresh pipelines cloud-harness is always cloned into ./cloud-harness. - Any path of the form '..//...' must become 'cloud-harness/...'. + Resolves *path* to a relative path from CWD, then skips over any leading '..' + components. If the first real directory name after those is 'cloud-harness', + the path is rewritten to start with 'cloud-harness/' — regardless of how many + levels up cloud-harness lives or whether an absolute path was passed. + All other paths are returned unchanged (as their relpath from CWD). """ - parts = rel_path.replace('\\', '/').split('/') - if parts[0] == '..': - return '/'.join([CLOUD_HARNESS_PATH] + parts[2:]) - return rel_path + rel = os.path.relpath(os.path.abspath(path), '.') + parts = rel.replace('\\', '/').split('/') + # Skip all leading '..' components + i = 0 + while i < len(parts) and parts[i] == '..': + i += 1 + # If we crossed at least one '..' and the next component is cloud-harness, rewrite + if i > 0 and i < len(parts) and parts[i] == CLOUD_HARNESS_PATH: + return '/'.join([CLOUD_HARNESS_PATH] + parts[i + 1:]) + return rel # Codefresh variables may need quotes: adjust yaml dump accordingly @@ -459,7 +469,7 @@ def adjust_build_steps(index): cmds[i] = cmds[i].replace("$ENV", "-".join(envs)) cmds[i] = cmds[i].replace("$PARAMS", " ".join(params)) cmds[i] = cmds[i].replace("$PATHS", " ".join( - _to_codefresh_path(os.path.relpath(root_path, '.')) + _to_codefresh_path(root_path) for root_path in root_paths if DEFAULT_MERGE_PATH not in root_path)) steps = codefresh["steps"] diff --git a/tools/deployment-cli-tools/tests/test_codefresh.py b/tools/deployment-cli-tools/tests/test_codefresh.py index 6a5a6af47..73ef7e05c 100644 --- a/tools/deployment-cli-tools/tests/test_codefresh.py +++ b/tools/deployment-cli-tools/tests/test_codefresh.py @@ -668,131 +668,143 @@ def test_env_dockerfile_codefresh_fallback(): def test_codefresh_paths_use_cloned_cloud_harness(): - """When cloud-harness root is outside the current directory (e.g. ../cloud-harness), - paths in the generated codefresh YAML should use ./cloud-harness (the cloned location - inside the pipeline working directory), not ../cloud-harness.""" + """When cloud-harness root is outside the current directory (e.g. ../cloud-harness or + an absolute path), paths in the generated codefresh YAML should use cloud-harness + (the cloned location inside the pipeline working directory), not ../cloud-harness.""" import tempfile - # Create a sibling directory to simulate running from a different project - with tempfile.TemporaryDirectory(dir=os.path.dirname(CLOUDHARNESS_ROOT)) as tmp_project_dir: - old_cwd = os.getcwd() - try: - os.chdir(tmp_project_dir) - - # Sanity check: cloud-harness is indeed above the current directory - assert os.path.relpath(CLOUDHARNESS_ROOT, '.').startswith('..'), \ - "Test setup issue: cloud-harness should be outside the current directory" - - values = create_helm_chart( - [CLOUDHARNESS_ROOT, RESOURCES], - output_path=OUT, - include=['samples'], - domain="my.local", - namespace='test', - env='dev', - local=False, - tag=1, - registry='reg' - ) - - root_paths = preprocess_build_overrides( - root_paths=[CLOUDHARNESS_ROOT, RESOURCES], - helm_values=values, - merge_build_path=BUILD_MERGE_DIR - ) - - build_included = [app['harness']['name'] - for app in values['apps'].values() if 'harness' in app] - - cf = create_codefresh_deployment_scripts(root_paths, include=build_included, - envs=['dev'], - base_image_name=values['name'], - helm_values=values, save=False) - - # harness-deployment command must use ./cloud-harness, not ../cloud-harness - cmds = cf['steps']['prepare_deployment']['commands'] - harness_cmd = next(cmd for cmd in cmds if 'harness-deployment' in cmd) - assert '../cloud-harness' not in harness_cmd, ( - f"harness-deployment command should not reference ../cloud-harness; got: {harness_cmd}" - ) - assert ' cloud-harness' in harness_cmd or harness_cmd.startswith('harness-deployment cloud-harness'), ( - f"harness-deployment command should reference cloud-harness (the cloned location); got: {harness_cmd}" - ) - - # working_directory in all build steps must not escape above the current directory - all_build_steps = {} - for step_name in [STEP_0, STEP_1, STEP_2, STEP_3]: - if step_name in cf['steps']: - all_build_steps.update(cf['steps'][step_name]['steps']) - - for step_name, step in all_build_steps.items(): - wd = step.get('working_directory', '') - assert not wd.startswith('../'), ( - f"Build step '{step_name}' working_directory must not start with '../'; got: {wd}" + for ch_root in [CLOUDHARNESS_ROOT, os.path.abspath(CLOUDHARNESS_ROOT)]: + # Create a sibling directory to simulate running from a different project + with tempfile.TemporaryDirectory(dir=os.path.dirname(CLOUDHARNESS_ROOT)) as tmp_project_dir: + old_cwd = os.getcwd() + try: + os.chdir(tmp_project_dir) + + values = create_helm_chart( + [ch_root, RESOURCES], + output_path=OUT, + include=['samples'], + domain="my.local", + namespace='test', + env='dev', + local=False, + tag=1, + registry='reg' ) - assert not wd.startswith('./../'), ( - f"Build step '{step_name}' working_directory must not start with './../'; got: {wd}" + + root_paths = preprocess_build_overrides( + root_paths=[ch_root, RESOURCES], + helm_values=values, + merge_build_path=BUILD_MERGE_DIR ) - finally: - os.chdir(old_cwd) - shutil.rmtree(BUILD_MERGE_DIR, ignore_errors=True) + + build_included = [app['harness']['name'] + for app in values['apps'].values() if 'harness' in app] + + cf = create_codefresh_deployment_scripts(root_paths, include=build_included, + envs=['dev'], + base_image_name=values['name'], + helm_values=values, save=False) + + # harness-deployment command must use cloud-harness, not the original path + cmds = cf['steps']['prepare_deployment']['commands'] + harness_cmd = next(cmd for cmd in cmds if 'harness-deployment' in cmd) + assert '../cloud-harness' not in harness_cmd, ( + f"harness-deployment command should not reference ../cloud-harness " + f"(ch_root={ch_root!r}); got: {harness_cmd}" + ) + assert os.path.abspath(ch_root) not in harness_cmd, ( + f"harness-deployment command should not contain the absolute path " + f"(ch_root={ch_root!r}); got: {harness_cmd}" + ) + assert 'cloud-harness' in harness_cmd, ( + f"harness-deployment command should reference cloud-harness " + f"(ch_root={ch_root!r}); got: {harness_cmd}" + ) + + # working_directory in all build steps must not escape cwd or use absolute paths + all_build_steps = {} + for step_name in [STEP_0, STEP_1, STEP_2, STEP_3]: + if step_name in cf['steps']: + all_build_steps.update(cf['steps'][step_name]['steps']) + + for step_name, step in all_build_steps.items(): + wd = step.get('working_directory', '') + assert not wd.startswith('../'), ( + f"Build step '{step_name}' working_directory must not start with '../' " + f"(ch_root={ch_root!r}); got: {wd}" + ) + assert not wd.startswith('./../'), ( + f"Build step '{step_name}' working_directory must not start with './../' " + f"(ch_root={ch_root!r}); got: {wd}" + ) + assert not os.path.isabs(wd), ( + f"Build step '{step_name}' working_directory must not be absolute " + f"(ch_root={ch_root!r}); got: {wd}" + ) + finally: + os.chdir(old_cwd) + shutil.rmtree(BUILD_MERGE_DIR, ignore_errors=True) def test_codefresh_working_directory_uses_cloned_cloud_harness(): """The working_directory for build steps that source images from cloud-harness must - use ./cloud-harness/... when cloud-harness is a sibling directory of the project.""" + use ./cloud-harness/... for any form of input path (relative, absolute, with ..).""" import tempfile - with tempfile.TemporaryDirectory(dir=os.path.dirname(CLOUDHARNESS_ROOT)) as tmp_project_dir: - old_cwd = os.getcwd() - try: - os.chdir(tmp_project_dir) - - values = create_helm_chart( - [CLOUDHARNESS_ROOT, RESOURCES], - output_path=OUT, - include=['samples'], - domain="my.local", - namespace='test', - env='dev', - local=False, - tag=1, - registry='reg' - ) - - root_paths = preprocess_build_overrides( - root_paths=[CLOUDHARNESS_ROOT, RESOURCES], - helm_values=values, - merge_build_path=BUILD_MERGE_DIR - ) - - build_included = [app['harness']['name'] - for app in values['apps'].values() if 'harness' in app] - - cf = create_codefresh_deployment_scripts(root_paths, include=build_included, - envs=['dev'], - base_image_name=values['name'], - helm_values=values, save=False) - - # cloudharness-base-images and common images come from the cloud-harness root; - # their working_directory must start with ./cloud-harness, not ./../cloud-harness - all_build_steps = {} - for step_name in [STEP_0, STEP_1, STEP_2, STEP_3]: - if step_name in cf['steps']: - all_build_steps.update(cf['steps'][step_name]['steps']) - - ch_steps = { - name: step for name, step in all_build_steps.items() - if 'cloudharness' in name or name == 'samples' - } - assert ch_steps, "Expected at least one cloud-harness image build step" + for ch_root in [CLOUDHARNESS_ROOT, os.path.abspath(CLOUDHARNESS_ROOT)]: + with tempfile.TemporaryDirectory(dir=os.path.dirname(CLOUDHARNESS_ROOT)) as tmp_project_dir: + old_cwd = os.getcwd() + try: + os.chdir(tmp_project_dir) + + values = create_helm_chart( + [ch_root, RESOURCES], + output_path=OUT, + include=['samples'], + domain="my.local", + namespace='test', + env='dev', + local=False, + tag=1, + registry='reg' + ) - for step_name, step in ch_steps.items(): - wd = step.get('working_directory', '') - assert not wd.startswith('../') and './../' not in wd, ( - f"Cloud-harness build step '{step_name}' working_directory must not " - f"escape the current directory with '../'; got: {wd}" + root_paths = preprocess_build_overrides( + root_paths=[ch_root, RESOURCES], + helm_values=values, + merge_build_path=BUILD_MERGE_DIR ) - finally: - os.chdir(old_cwd) - shutil.rmtree(BUILD_MERGE_DIR, ignore_errors=True) + + build_included = [app['harness']['name'] + for app in values['apps'].values() if 'harness' in app] + + cf = create_codefresh_deployment_scripts(root_paths, include=build_included, + envs=['dev'], + base_image_name=values['name'], + helm_values=values, save=False) + + all_build_steps = {} + for step_name in [STEP_0, STEP_1, STEP_2, STEP_3]: + if step_name in cf['steps']: + all_build_steps.update(cf['steps'][step_name]['steps']) + + ch_steps = { + name: step for name, step in all_build_steps.items() + if 'cloudharness' in name or name == 'samples' + } + assert ch_steps, "Expected at least one cloud-harness image build step" + + for step_name, step in ch_steps.items(): + wd = step.get('working_directory', '') + assert not wd.startswith('../') and './../' not in wd, ( + f"Cloud-harness build step '{step_name}' working_directory must not " + f"escape the current directory with '../' (ch_root={ch_root!r}); got: {wd}" + ) + assert not os.path.isabs(wd), ( + f"Cloud-harness build step '{step_name}' working_directory must not " + f"be absolute (ch_root={ch_root!r}); got: {wd}" + ) + finally: + os.chdir(old_cwd) + shutil.rmtree(BUILD_MERGE_DIR, ignore_errors=True)